from typing import cast
import re
from itertools import combinations
from collections.abc import Iterator
from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
    update_graph,
    TokenGraph,
)
# ** Logging
# logging is configured centrally in lang_main.loggers
# ** POS
# part-of-speech classes whose tokens are included in the graph
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
# POS classes which only connect other tokens indirectly
# (they contribute no node of their own)
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# fine-grained tags included in addition to the POS above (currently none)
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connections in texts
def pre_clean_word(string: str) -> str:
    # strip every character except ASCII letters and German umlauts
    # (note: 'ß' is not retained by this pattern)
    pattern = r'[^A-Za-zäöüÄÖÜ]+'
    string = re.sub(pattern, '', string)
    return string
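# illustrative check (hypothetical input):
# >>> pre_clean_word('Förderband-2021!')
# 'Förderband'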
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
    try:
        # check whether the string is a number:
        # purely numeric strings longer than 8 characters cannot be dates
        int(string)
        if len(string) > 8:
            return False
    except ValueError:
        # not a number, may still be a date
        pass
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
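# illustrative checks:
# >>> is_str_date('12.03.2021')
# True
# >>> is_str_date('1234567890')
# False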
def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:
    for descendant in token.subtree:
        # the subtree contains the token itself;
        # if the current element is the token, skip it
        if descendant == token:
            continue
        # if the descendant is a date, skip it
        if is_str_date(string=descendant.text):
            continue
        logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
                      f">>{descendant}<<, POS >>{descendant.pos_}<<"))
        # eliminate cases of cross-references between verbs
        if token.pos_ in ('AUX', 'VERB') and descendant.pos_ in ('AUX', 'VERB'):
            continue
        # skip cases in which the descendant has an indirect POS
        # combined with anything but verbs (verb pairs were handled above)
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # skip cases in which the descendant has neither a relevant POS nor TAG
        elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
            continue
        yield descendant
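# illustrative use (assumes an installed German pipeline such as
# 'de_core_news_sm'; the exact pairs depend on the parse):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> doc = nlp('Der Motor der Pumpe macht Geräusche.')
# >>> [(t.text, d.text) for t in doc for d in obtain_relevant_descendants(token=t)]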
# TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int,
) -> None:
    # iterate over sentences
    for sent in doc.sents:
        # iterate over tokens in each sentence
        for token in sent:
            # skip tokens which are not relevant
            if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                continue
            # skip tokens which are dates or times
            if is_str_date(string=token.text):
                continue
            relevant_descendants = obtain_relevant_descendants(token=token)
            if token.pos_ not in POS_INDIRECT:
                # for direct POS: add parent <--> descendant pairs to the graph
                for descendant in relevant_descendants:
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # for indirect POS: connect all associated words pairwise
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! parent and child do not really exist in this case,
                    # !! only one connection per pair is made
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )
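# sketch of the indirect branch (assumes an installed German pipeline and a
# parse in which the AUX governs both nouns; model-dependent):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> g = TokenGraph()
# >>> add_doc_info_to_graph(graph=g, doc=nlp('Die Pumpe ist ein Verschleißteil.'), weight=1)
# the AUX 'ist' contributes no node itself; it yields a single
# edge Pumpe <--> Verschleißteil with the given weight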
def build_token_graph(
    data: DataFrame,
    model: GermanSpacyModel,
) -> tuple[TokenGraph]:
    # empty token graph, directed while being built
    graph = TokenGraph()
    for row in tqdm(data.itertuples(), total=len(data)):
        # obtain properties from the row tuple;
        # attribute names must match the preprocessed data
        entry_text = cast(str, row.entry)
        weight = cast(int, row.num_occur)
        # get spaCy model output
        doc = model(entry_text)
        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )
    # update metadata after all entries have been processed
    graph.update_metadata()
    # convert to an undirected graph
    graph.to_undirected()
    return (graph,)
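# minimal usage sketch (assumes preprocessed data with the columns
# 'entry' and 'num_occur' and an installed German spaCy pipeline):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> frame = DataFrame({'entry': ['Die Pumpe ist defekt.'], 'num_occur': [3]})
# >>> (token_graph,) = build_token_graph(data=frame, model=nlp)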