"""Build a token co-occurrence graph from spaCy dependency parses.

Walks each parsed document sentence by sentence, selects tokens with
relevant part-of-speech tags, and records parent/descendant lemma pairs
(weighted by entry frequency) in a ``TokenGraph``.
"""

from collections.abc import Iterator
from itertools import combinations
import re
from typing import cast

from dateutil.parser import parse
from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm

from lang_main.analysis.graphs import (
    TokenGraph,
    update_graph,
)
from lang_main.loggers import logger_token_analysis as logger

# ** POS
# Parts of speech whose tokens become nodes in the graph.
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
# POS treated as indirect connectors: they never become a parent node
# themselves; instead their descendants are pairwise linked to each other.
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])

# ** TAG
# Fine-grained tags that qualify a token even without a relevant POS
# (currently empty, i.e. POS alone decides).
TAG_OF_INTEREST: frozenset[str] = frozenset()

# Matches every run of characters that is not a Latin/German letter.
# Compiled once at module level so repeated calls avoid the cache lookup.
_NON_LETTER_RE = re.compile(r'[^A-Za-zäöüÄÖÜ]+')


# ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
    """Remove every character that is not a Latin or German umlaut letter.

    Parameters
    ----------
    string : str
        Raw word to clean.

    Returns
    -------
    str
        ``string`` with all non-letter characters stripped out.
    """
    return _NON_LETTER_RE.sub('', string)


# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
    """Check whether ``string`` can be interpreted as a date/time.

    Parameters
    ----------
    string : str
        Candidate text.
    fuzzy : bool, default False
        Passed through to :func:`dateutil.parser.parse`; when True,
        ignores unknown tokens surrounding the date.

    Returns
    -------
    bool
        True if ``dateutil`` can parse the string as a date, False otherwise.
    """
    try:
        # A plain number longer than 8 digits cannot be a date
        # (e.g. not DDMMYYYY) — reject it without invoking the parser.
        int(string)
        if len(string) > 8:
            return False
    except ValueError:
        # Not a plain number; fall through to the date parser.
        pass
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # OverflowError: dateutil raises it for very large numeric
        # strings instead of ValueError — treat both as "not a date".
        return False
    return True


def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:
    """Yield the syntactically relevant descendants of ``token``.

    Filters ``token.subtree`` down to tokens that should be connected to
    ``token`` in the graph: dates, verb/verb cross-references, indirect
    POS, and irrelevant POS/TAG combinations are skipped.

    Parameters
    ----------
    token : SpacyToken
        Parent token whose subtree is inspected.

    Yields
    ------
    SpacyToken
        Each descendant that passes all relevance filters.
    """
    for descendant in token.subtree:
        # Subtrees contain the token itself; skip that element.
        if descendant == token:
            continue
        # Skip descendants that are dates.
        if is_str_date(string=descendant.text):
            continue
        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        logger.debug(
            'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
            token, token.pos_, descendant, descendant.pos_,
        )
        # Eliminate cases of cross-references between verbs.
        if ((token.pos_ == 'AUX' or token.pos_ == 'VERB')
                and (descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
            continue
        # Skip cases in which the descendant is an indirect POS
        # paired with something other than a verb.
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # Skip cases in which the descendant has no relevant POS or TAG.
        elif not (descendant.pos_ in POS_OF_INTEREST
                  or descendant.tag_ in TAG_OF_INTEREST):
            continue
        yield descendant


# TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int,
) -> None:
    """Add all relevant token connections of ``doc`` to ``graph`` in place.

    Parameters
    ----------
    graph : TokenGraph
        Graph updated in place via :func:`update_graph`.
    doc : SpacyDoc
        Parsed document whose sentences are scanned.
    weight : int
        Connection weight applied to every edge added for this document.
    """
    # Iterate over sentences, then over the tokens of each sentence.
    for sent in doc.sents:
        for token in sent:
            # Skip tokens which are not relevant by POS or TAG.
            if not (token.pos_ in POS_OF_INTEREST
                    or token.tag_ in TAG_OF_INTEREST):
                continue
            # Skip tokens which are dates or times.
            if is_str_date(string=token.text):
                continue
            relevant_descendants = obtain_relevant_descendants(token=token)
            if token.pos_ not in POS_INDIRECT:
                # Non-indirect POS: add parent <--> descendant pairs.
                for descendant in relevant_descendants:
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # Indirect POS: connect all associated words pairwise.
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! Parents and children do not really exist in this
                    # !! case; only one connection is made per pair.
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )


def build_token_graph(
    data: DataFrame,
    model: GermanSpacyModel,
) -> tuple[TokenGraph]:
    """Build an undirected token graph from preprocessed text entries.

    Parameters
    ----------
    data : DataFrame
        Must expose ``entry`` (str) and ``num_occur`` (int) columns; the
        attribute names must match the preprocessed data exactly.
    model : GermanSpacyModel
        spaCy pipeline used to parse each entry.

    Returns
    -------
    tuple[TokenGraph]
        One-element tuple containing the finished, undirected graph.
    """
    graph = TokenGraph()
    for row in tqdm(data.itertuples(), total=len(data)):
        # Obtain properties from the row tuple.
        entry_text = cast(str, row.entry)
        weight = cast(int, row.num_occur)
        # Parse the entry and fold its connections into the graph.
        doc = model(entry_text)
        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )
    # Finalize: refresh metadata, then convert to an undirected graph.
    graph.update_metadata()
    graph.to_undirected()
    return (graph,)