from typing import cast
import re
from itertools import combinations
from collections.abc import Iterator

from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm

from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
    update_graph,
    TokenGraph,
)

# ** Logging
#LOGGING_LEVEL = 'INFO'
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
#logger = logging.getLogger('ihm_analyse.token_analysis')

# ** POS
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])

#POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])

# ** TAG
#TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()

# ** obtaining connections in texts

def pre_clean_word(string: str) -> str:
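    """Strip every character which is not a Latin letter or a German umlaut."""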
    pattern = r'[^A-Za-zäöüÄÖÜ]+'
    string = re.sub(pattern, '', string)

    return string


# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
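    """Return whether the given string can be interpreted as a date.

    Plain numbers longer than eight characters are rejected up front.
    """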
    #print(string)
    try:
        # check if string is a number;
        # if its length is greater than 8, it is not a date
        int(string)
        if len(string) > 8:
            return False
    except ValueError:
        # not a number
        pass

    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False


def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:
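    """Yield the descendants of the given token with a relevant POS or TAG.

    Dates, indirect POS and verb-to-verb cross-references are skipped.
    """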
    for descendant in token.subtree:
        # subtrees contain the token itself:
        # if the current element is the token, skip it
        if descendant == token:
            continue

        # if the descendant is a date, skip it
        if is_str_date(string=descendant.text):
            continue

        logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
                      f">>{descendant}<<, POS >>{descendant.pos_}<<"))

        # eliminate cases of cross-references between verbs
        if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and
                (descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
            continue
        # skip cases in which the descendant is an indirect POS paired with a non-verb
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # skip cases in which the descendant has no relevant POS or TAG
        elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
            continue

        yield descendant

# TODO: look at results and fine-tune function accordingly


def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int,
) -> None:
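    """Add the token connections found in a spaCy doc to the given graph.

    Each qualifying (token, descendant) pair is added as an edge with the
    given connection weight; for indirect POS tokens the relevant descendants
    are connected with each other instead.
    """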
    # iterate over sentences
    for sent in doc.sents:
        # iterate over tokens in sentence
        for token in sent:
            # skip tokens which are not relevant
            if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                continue
            # skip tokens which are dates or times
            if is_str_date(string=token.text):
                continue

            relevant_descendants = obtain_relevant_descendants(token=token)
            # for non-AUX: add parent <--> descendant pair to graph
            if token.pos_ not in POS_INDIRECT:
                for descendant in relevant_descendants:
                    # add descendant and parent to graph
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # if indirect POS, make connections between all associated words
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! parents and children do not really exist in this case,
                    # !! but only one connection is made
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )


def build_token_graph(
    data: DataFrame,
    model: GermanSpacyModel,
) -> tuple[TokenGraph]:
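    """Build a token graph from the preprocessed entries in the given data.

    The DataFrame must provide the columns 'entry' (text) and 'num_occur'
    (connection weight), which are accessed as attributes via itertuples.
    """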
    # empty NetworkX directed graph
    #graph = nx.DiGraph()
    graph = TokenGraph()

    for row in tqdm(data.itertuples(), total=len(data)):
        # obtain properties from tuple;
        # attribute names must match the preprocessed data
        entry_text = cast(str, row.entry)
        weight = cast(int, row.num_occur)

        # get spaCy model output
        doc = model(entry_text)

        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )

    # metadata
    graph.update_metadata()
    # convert to undirected
    graph.to_undirected()

    return (graph,)
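

# ** usage sketch
# Illustrative only, not part of the original pipeline: it assumes that a
# German spaCy pipeline such as 'de_core_news_sm' is installed and that the
# preprocessed data provides the columns 'entry' and 'num_occur' expected by
# `build_token_graph`.
if __name__ == '__main__':
    import spacy
    import pandas as pd

    # hypothetical preprocessed entries with their occurrence counts
    example_data = pd.DataFrame(
        {
            'entry': ['Der Motor der Pumpe ist defekt.'],
            'num_occur': [3],
        }
    )
    spacy_model = cast(GermanSpacyModel, spacy.load('de_core_news_sm'))
    (token_graph,) = build_token_graph(data=example_data, model=spacy_model)
    print(token_graph)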