from typing import cast
import re
from itertools import combinations
from collections.abc import Iterator
from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
    update_graph,
    TokenGraph,
)
# ** Logging
# logging is configured centrally in lang_main.loggers
# ** POS
# part-of-speech classes whose tokens are included in the graph
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
# POS classes which only connect other tokens indirectly
# (they contribute no node of their own)
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# fine-grained tags included in addition to the POS above (currently none)
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connections in texts
def pre_clean_word(string: str) -> str:
    # strip every character except ASCII letters and German umlauts
    # (note: 'ß' is not retained by this pattern)
    pattern = r'[^A-Za-zäöüÄÖÜ]+'
    string = re.sub(pattern, '', string)
    return string
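# illustrative check (hypothetical input):
# >>> pre_clean_word('Förderband-2021!')
# 'Förderband'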
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
    try:
        # check whether the string is a number:
        # purely numeric strings longer than 8 characters cannot be dates
        int(string)
        if len(string) > 8:
            return False
    except ValueError:
        # not a number, may still be a date
        pass
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
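# illustrative checks:
# >>> is_str_date('12.03.2021')
# True
# >>> is_str_date('1234567890')
# False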
def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:
    for descendant in token.subtree:
        # the subtree contains the token itself;
        # if the current element is the token, skip it
        if descendant == token:
            continue
        # if the descendant is a date, skip it
        if is_str_date(string=descendant.text):
            continue
        logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
                      f">>{descendant}<<, POS >>{descendant.pos_}<<"))
        # eliminate cases of cross-references between verbs
        if token.pos_ in ('AUX', 'VERB') and descendant.pos_ in ('AUX', 'VERB'):
            continue
        # skip cases in which the descendant has an indirect POS
        # combined with anything but verbs (verb pairs were handled above)
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # skip cases in which the descendant has neither a relevant POS nor TAG
        elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
            continue
        yield descendant
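# illustrative use (assumes an installed German pipeline such as
# 'de_core_news_sm'; the exact pairs depend on the parse):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> doc = nlp('Der Motor der Pumpe macht Geräusche.')
# >>> [(t.text, d.text) for t in doc for d in obtain_relevant_descendants(token=t)]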
# TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int,
) -> None:
    # iterate over sentences
    for sent in doc.sents:
        # iterate over tokens in each sentence
        for token in sent:
            # skip tokens which are not relevant
            if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                continue
            # skip tokens which are dates or times
            if is_str_date(string=token.text):
                continue
            relevant_descendants = obtain_relevant_descendants(token=token)
            if token.pos_ not in POS_INDIRECT:
                # for direct POS: add parent <--> descendant pairs to the graph
                for descendant in relevant_descendants:
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # for indirect POS: connect all associated words pairwise
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! parent and child do not really exist in this case,
                    # !! only one connection per pair is made
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )
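# sketch of the indirect branch (assumes an installed German pipeline and a
# parse in which the AUX governs both nouns; model-dependent):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> g = TokenGraph()
# >>> add_doc_info_to_graph(graph=g, doc=nlp('Die Pumpe ist ein Verschleißteil.'), weight=1)
# the AUX 'ist' contributes no node itself; it yields a single
# edge Pumpe <--> Verschleißteil with the given weight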
def build_token_graph(
    data: DataFrame,
    model: GermanSpacyModel,
) -> tuple[TokenGraph]:
    # empty token graph, directed while being built
    graph = TokenGraph()
    for row in tqdm(data.itertuples(), total=len(data)):
        # obtain properties from the row tuple;
        # attribute names must match the preprocessed data
        entry_text = cast(str, row.entry)
        weight = cast(int, row.num_occur)
        # get spaCy model output
        doc = model(entry_text)
        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )
    # update metadata after all entries have been processed
    graph.update_metadata()
    # convert to an undirected graph
    graph.to_undirected()
    return (graph,)
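# minimal usage sketch (assumes preprocessed data with the columns
# 'entry' and 'num_occur' and an installed German spaCy pipeline):
# >>> import spacy
# >>> nlp = spacy.load('de_core_news_sm')
# >>> frame = DataFrame({'entry': ['Die Pumpe ist defekt.'], 'num_occur': [3]})
# >>> (token_graph,) = build_token_graph(data=frame, model=nlp)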