"""Tests for the token analysis helpers in ``lang_main.analysis.tokens``."""

from pathlib import Path

import pytest

from lang_main import model_loader
from lang_main.analysis import graphs, tokens
from lang_main.types import SpacyModelTypes

# Two German sentences shared by the spaCy-based tests below; the text
# contains a date-like token ('22.05.') and a numeric ID ('0912393').
SENTENCE = (
    'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, '
    'um ihm zu helfen. Ich konnte nicht mit ansehen, wie er Probleme beim Tragen '
    'seiner Tasche hatte.'
)


@pytest.fixture(scope='module')
def spacy_model():
    """Load the ``DE_CORE_NEWS_SM`` spaCy model once per test module."""
    return model_loader.load_spacy(
        model_name=SpacyModelTypes.DE_CORE_NEWS_SM,
    )


def _relevant_descendant_texts(word):
    """Return the texts of ``tokens.obtain_relevant_descendants(word)`` as a tuple."""
    return tuple(
        token.text for token in tokens.obtain_relevant_descendants(word)
    )


def test_pre_clean_word():
    """Digits embedded in a word are removed; umlauts are kept."""
    string = 'Öl3bad2024prüfung'

    assert tokens.pre_clean_word(string) == 'Ölbadprüfung'


def test_is_str_date():
    """Date-like strings are accepted; plain numbers and mixed words are not."""
    # The partial date (no year) is checked with ``fuzzy=True``.
    assert tokens.is_str_date('22.05.', fuzzy=True)
    assert tokens.is_str_date('22.05.2024')
    assert tokens.is_str_date('22-05-2024')

    assert not tokens.is_str_date('9009090909')
    assert not tokens.is_str_date('hello347')


# TODO: depends on fixed Constants
def test_obtain_relevant_descendants(spacy_model):
    """Relevant descendants are checked for the second token of each sentence."""
    doc = spacy_model(SENTENCE)
    # Materialize the sentence iterator once so both sentences can be indexed.
    sentences = tuple(doc.sents)

    first_sentence = sentences[0]
    word1 = first_sentence[1]  # word "ging" (POS:VERB)
    descendants1 = ('0912393', 'schnell', 'Wiese', 'Menschen')
    assert descendants1 == _relevant_descendant_texts(word1)

    second_sentence = sentences[1]
    word2 = second_sentence[1]  # word "konnte" (POS:AUX)
    descendants2 = ('mit', 'Probleme', 'Tragen', 'Tasche')
    assert descendants2 == _relevant_descendant_texts(word2)


def test_add_doc_info_to_graph(spacy_model):
    """Adding the parsed ``SENTENCE`` to an empty graph yields the expected size."""
    doc = spacy_model(SENTENCE)
    tk_graph = graphs.TokenGraph()

    tokens.add_doc_info_to_graph(tk_graph, doc, weight=2)

    # NOTE(review): the expected counts are hard-coded; presumably they depend
    # on the loaded spaCy model and on the project's constants -- confirm when
    # either changes.
    assert len(tk_graph.nodes) == 11
    assert len(tk_graph.edges) == 17
    assert '0912393' in tk_graph.nodes


def test_build_token_graph(
    data_merge_similarity_duplicates,
    spacy_model,
    data_tk_graph_built,
):
    """The built token graph has as many nodes and edges as the reference graph.

    NOTE(review): ``data_merge_similarity_duplicates`` and
    ``data_tk_graph_built`` are fixtures defined outside this module
    (presumably in ``conftest.py``) -- confirm.
    """
    # Only the first element of the returned pair (the graph) is checked here.
    tk_graph, _ = tokens.build_token_graph(
        data=data_merge_similarity_duplicates,
        model=spacy_model,
    )

    assert len(tk_graph.nodes) == len(data_tk_graph_built.nodes)
    assert len(tk_graph.edges) == len(data_tk_graph_built.edges)