# lang-main/tests/analysis/test_tokens.py

import pytest
from lang_main import model_loader
from lang_main.analysis import graphs, tokens
from lang_main.types import SpacyModelTypes
# German two-sentence sample text containing a date ('22.05.') and a numeric
# ID ('0912393'); shared by the tokenization / graph tests below.
SENTENCE = (
    'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, '
    'um ihm zu helfen. Ich konnte nicht mit ansehen, wie er Probleme beim Tragen '
    'seiner Tasche hatte.'
)
@pytest.fixture(scope='module')
def spacy_model():
    """Load the small German spaCy model once for the whole test module."""
    return model_loader.load_spacy(model_name=SpacyModelTypes.DE_CORE_NEWS_SM)
def test_pre_clean_word():
    """Digits embedded in a word are stripped during pre-cleaning."""
    raw = 'Öl3bad2024prüfung'
    cleaned = tokens.pre_clean_word(raw)
    assert cleaned == 'Ölbadprüfung'
def test_is_str_date():
    """Date-like strings are recognised; plain digit runs are not."""
    # Partial date needs fuzzy matching to be accepted.
    assert tokens.is_str_date('22.05.', fuzzy=True)
    for date_like in ('22.05.2024', '22-05-2024'):
        assert tokens.is_str_date(date_like)
    for non_date in ('9009090909', 'hello347'):
        assert not tokens.is_str_date(non_date)
# TODO: depends on fixed Constants
def test_obtain_relevant_descendants(spacy_model):
    """Relevant descendants are extracted for a verb and an auxiliary."""
    doc = spacy_model(SENTENCE)
    sentences = tuple(doc.sents)

    # First sentence: word at index 1 is "ging" (POS: VERB).
    verb = sentences[0][1]
    expected = ('ID', '0912393', 'schnell', 'Wiese', 'Menschen')
    found = tuple(tok.text for tok in tokens.obtain_relevant_descendants(verb))
    assert found == expected

    # Second sentence: word at index 1 is "konnte" (POS: AUX).
    aux = sentences[1][1]
    expected = ('Probleme', 'Tragen', 'Tasche')
    found = tuple(tok.text for tok in tokens.obtain_relevant_descendants(aux))
    assert found == expected
def test_add_doc_info_to_graph(spacy_model):
    """Adding a parsed doc populates the token graph with expected counts."""
    graph = graphs.TokenGraph()
    tokens.add_doc_info_to_graph(graph, spacy_model(SENTENCE), weight=2)

    assert len(graph.nodes) == 11
    assert len(graph.edges) == 16
    # The numeric ID from SENTENCE must survive as a node.
    assert '0912393' in graph.nodes
def test_build_token_graph(
    data_merge_similarity_duplicates,
    spacy_model,
    data_tk_graph_built,
):
    """A freshly built token graph matches the reference graph's size."""
    built_graph, _ = tokens.build_token_graph(
        data=data_merge_similarity_duplicates,
        model=spacy_model,
    )
    # Compare only node/edge counts against the pre-built reference fixture.
    assert len(built_graph.nodes) == len(data_tk_graph_built.nodes)
    assert len(built_graph.edges) == len(data_tk_graph_built.edges)