started adding comprehensive unit tests

2024-11-13 17:54:47 +01:00
parent a0ca71ea87
commit 6781b4a132
32 changed files with 4042 additions and 1430 deletions
--- a/tests/Dummy_Dataset_N_1000.csv
+++ b/tests/Dummy_Dataset_N_1000.csv
--- a/tests/analyse_dataset.xlsx
+++ b/tests/analyse_dataset.xlsx
--- a/tests/analysis/init.py
+++ b/tests/analysis/init.py
--- a/tests/analysis/test_graphs.py
+++ b/tests/analysis/test_graphs.py
@@ -0,0 +1,168 @@
+import networkx as nx
+import pytest
+
+from lang_main.analysis import graphs
+
+# Name passed to graphs.TokenGraph in build_init_graph and compared against
+# tk_graph.name in test_tk_graph_properties.
+TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'
+
+
+def build_init_graph(token_graph: bool):
+    """Build the small weighted graph used by the tests in this module.
+
+    The graph is made of the nodes 1 to 4 and six edges whose ``weight``
+    attribute runs from 1 to 6 in definition order.
+
+    Args:
+        token_graph: If true, a ``graphs.TokenGraph`` named ``TK_GRAPH_NAME``
+            is created; otherwise a plain ``nx.DiGraph``.
+
+    Returns:
+        The graph object with all six weighted edges added.
+    """
+    node_pairs = [(1, 2), (1, 3), (2, 4), (3, 4), (1, 4), (2, 1)]
+    # the edge at position i (1-based) carries weight i
+    weighted_edges = [
+        (source, target, {'weight': position})
+        for position, (source, target) in enumerate(node_pairs, start=1)
+    ]
+
+    graph_obj = (
+        graphs.TokenGraph(name=TK_GRAPH_NAME, enable_logging=False)
+        if token_graph
+        else nx.DiGraph()
+    )
+    graph_obj.add_edges_from(weighted_edges)
+
+    return graph_obj
+
+
+@pytest.fixture(scope='module')
+def graph():
+    """Provide one plain ``nx.DiGraph`` test graph per test module."""
+    directed_graph = build_init_graph(token_graph=False)
+    return directed_graph
+
+
+@pytest.fixture(scope='module')
+def tk_graph():
+    """Provide one ``graphs.TokenGraph`` test graph per test module."""
+    token_graph_obj = build_init_graph(token_graph=True)
+    return token_graph_obj
+
+
+def test_graph_size(graph):
+    """The initial graph holds four nodes and six edges."""
+    node_count = len(graph.nodes)
+    edge_count = len(graph.edges)
+    assert (node_count, edge_count) == (4, 6)
+
+
+def test_save_to_GraphML(graph, tmp_path):
+    """Saving the graph creates ``<filename>.graphml`` inside the target folder."""
+    stem = 'test_graphML'
+    expected_file = (tmp_path / stem).with_suffix('.graphml')
+    graphs.save_to_GraphML(graph, saving_path=tmp_path, filename=stem)
+    assert expected_file.exists()
+
+
+def test_metadata_retrieval(graph):
+    """Compare the metadata of the initial directed graph with fixed values."""
+    # NOTE(review): the memory figures are hard-coded; presumably they depend on
+    # how get_graph_metadata measures object sizes -- confirm they are stable
+    # across interpreters/platforms.
+    expected = {
+        'num_nodes': 4,
+        'num_edges': 6,
+        'min_edge_weight': 1,
+        'max_edge_weight': 6,
+        'node_memory': 112,
+        'edge_memory': 336,
+        'total_memory': 448,
+    }
+    metadata = graphs.get_graph_metadata(graph)
+    assert {key: metadata[key] for key in expected} == expected
+
+
+def test_graph_update_batch():
+    """A batch update with two new edges of weight 8 yields 6 nodes and 8 edges."""
+    fresh_graph = build_init_graph(token_graph=False)
+    new_edges = ((4, 5), (5, 6))
+    graphs.update_graph(fresh_graph, batch=new_edges, weight_connection=8)
+    expected = {
+        'num_nodes': 6,
+        'num_edges': 8,
+        'min_edge_weight': 1,
+        'max_edge_weight': 8,
+    }
+    metadata = graphs.get_graph_metadata(fresh_graph)
+    assert {key: metadata[key] for key in expected} == expected
+
+
+def test_graph_update_single_new():
+    """Adding the new edge (4, 5) with weight 7 yields 5 nodes and 7 edges."""
+    fresh_graph = build_init_graph(token_graph=False)
+    graphs.update_graph(fresh_graph, parent=4, child=5, weight_connection=7)
+    expected = {
+        'num_nodes': 5,
+        'num_edges': 7,
+        'min_edge_weight': 1,
+        'max_edge_weight': 7,
+    }
+    metadata = graphs.get_graph_metadata(fresh_graph)
+    assert {key: metadata[key] for key in expected} == expected
+
+
+def test_graph_update_single_existing():
+    """Updating the existing edge (1, 4) keeps the size and raises the max weight.
+
+    The edge starts with weight 5; after an update with weight 5 the test
+    expects a maximum edge weight of 10 and unchanged node/edge counts.
+    """
+    fresh_graph = build_init_graph(token_graph=False)
+    graphs.update_graph(fresh_graph, parent=1, child=4, weight_connection=5)
+    expected = {
+        'num_nodes': 4,
+        'num_edges': 6,
+        'min_edge_weight': 1,
+        'max_edge_weight': 10,
+    }
+    metadata = graphs.get_graph_metadata(fresh_graph)
+    assert {key: metadata[key] for key in expected} == expected
+
+
+@pytest.mark.parametrize('cast_int', [True, False])
+def test_graph_undirected_conversion(graph, cast_int):
+    """The two opposite edges between nodes 1 and 2 are merged by summing weights."""
+    undirected = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)
+    # directed edges (1, 2, w=1) and (2, 1, w=6) --> undirected edge (1, 2, w=7)
+    merged_weight = undirected[1][2]['weight']
+    assert merged_weight == pytest.approx(7.0)
+
+
+def test_graph_cytoscape_conversion(graph):
+    """Check the first and last Cytoscape element and the reported weight range."""
+    elements, weight_range = graphs.convert_graph_to_cytoscape(graph)
+    first_data = elements[0]['data']  # type: ignore
+    last_data = elements[-1]['data']  # type: ignore
+    # the first element is expected to be node 1
+    assert first_data['id'] == 1  # type: ignore
+    # the last element is expected to be the edge (3, 4) with weight 4
+    assert last_data['source'] == 3  # type: ignore
+    assert last_data['target'] == 4  # type: ignore
+    assert last_data['weight'] == 4  # type: ignore
+    assert weight_range['min'] == 1
+    assert weight_range['max'] == 6
+
+
+def test_tk_graph_properties(tk_graph):
+    """Check name, directed/undirected views and both metadata sets of the graph."""
+    assert tk_graph.name == TK_GRAPH_NAME
+    assert isinstance(tk_graph.directed, graphs.TokenGraph)
+    assert isinstance(tk_graph.undirected, nx.Graph)
+
+    tk_graph.update_metadata()
+
+    expected_directed = {
+        'num_nodes': 4,
+        'num_edges': 6,
+        'min_edge_weight': 1,
+        'max_edge_weight': 6,
+        'node_memory': 112,
+        'edge_memory': 336,
+        'total_memory': 448,
+    }
+    directed = tk_graph.metadata_directed
+    assert {key: directed[key] for key in expected_directed} == expected_directed
+
+    # the undirected view merges (1, 2) and (2, 1): one edge fewer, weights 2..7
+    expected_undirected = {
+        'num_nodes': 4,
+        'num_edges': 5,
+        'min_edge_weight': 2,
+        'max_edge_weight': 7,
+        'node_memory': 112,
+        'edge_memory': 280,
+        'total_memory': 392,
+    }
+    undirected = tk_graph.metadata_undirected
+    assert {key: undirected[key] for key in expected_undirected} == expected_undirected
+
+
+def test_graph_degree_filter(tk_graph):
+    """Filtering with lower and upper degree bound 3 is expected to keep two nodes."""
+    degree_bound = 3
+    remaining = graphs.filter_graph_by_node_degree(
+        tk_graph,
+        bound_lower=degree_bound,
+        bound_upper=degree_bound,
+    )
+    assert len(remaining.nodes) == 2
+
+
+def test_graph_edge_number_filter(tk_graph):
+    """Limit the graph to one edge, then apply a degree filter with lower bound 1."""
+    edge_limit = 1
+    edge_limited = graphs.filter_graph_by_number_edges(tk_graph, limit=edge_limit)
+    assert len(edge_limited.edges) == edge_limit
+
+    connected_only = graphs.filter_graph_by_node_degree(
+        edge_limited,
+        bound_lower=1,
+        bound_upper=None,
+    )
+    assert len(connected_only.nodes) == 2, 'one edge should result in only two nodes'
--- a/tests/analysis/test_preprocessing.py
+++ b/tests/analysis/test_preprocessing.py
@@ -0,0 +1,73 @@
+"""testing each function in a consecutive way like each one is
+executed in a pipeline
+"""
+
+from lang_main.analysis import preprocessing as ppc
+from lang_main.analysis import shared
+
+
+def test_load_data(raw_data_path, raw_data_date_cols):
+    """Loading the dummy dataset yields 1000 entries."""
+    loaded = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
+    (frame,) = loaded
+    assert len(frame) == 1000
+
+
+def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols):
+    """After duplicate removal 999 of the loaded entries are expected to remain."""
+    (raw_frame,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
+    (deduplicated,) = ppc.remove_duplicates(raw_frame)
+    assert len(deduplicated) == 999
+
+
+def test_remove_na(raw_data_path, raw_data_date_cols):
+    """After duplicate and NA removal 998 entries are expected to remain."""
+    target_features: tuple[str] = ('VorgangsBeschreibung',)
+    (raw_frame,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
+    (deduplicated,) = ppc.remove_duplicates(raw_frame)
+    (without_na,) = ppc.remove_NA(deduplicated, target_features)
+    assert len(without_na) == 998
+
+
+# def test_string_cleansing():
+#     string = 'Ölleckage   durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
+#     cleaned_string = shared.clean_string_slim(string)
+#     target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
+#     assert cleaned_string == target_string
+
+
+def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols):
+    """Check ``clean_string_slim`` on a literal and on row 0 of the dataset."""
+    feature = 'VorgangsBeschreibung'
+    target_features: tuple[str] = (feature,)
+    literal_input = 'Ölleckage   durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
+    # row 0 of the target feature before cleansing
+    dataset_entry = 'Ölleckage   durch\nundichten    Ölsumpf,, aber Dichtung intakt??!!!'
+    expected_clean = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
+
+    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
+    (data,) = ppc.remove_duplicates(data)
+    (data,) = ppc.remove_NA(data, target_features)
+
+    # the cleansing function on its own
+    assert shared.clean_string_slim(literal_input) == expected_clean
+
+    # the same function applied to every entry of the target feature
+    assert data.at[0, feature] == dataset_entry
+    (data,) = shared.entry_wise_cleansing(
+        data,
+        target_features=target_features,
+        cleansing_func=shared.clean_string_slim,
+    )
+    assert data.at[0, feature] == expected_clean
+
+
+def test_analyse_feature(raw_data_path, raw_data_date_cols):
+    """Run the preprocessing steps up to ``analyse_feature`` and check the size."""
+    feature = 'VorgangsBeschreibung'
+    target_features: tuple[str] = (feature,)
+    literal_input = 'Ölleckage   durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
+    # row 0 of the target feature before cleansing
+    dataset_entry = 'Ölleckage   durch\nundichten    Ölsumpf,, aber Dichtung intakt??!!!'
+    expected_clean = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
+
+    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
+    (data,) = ppc.remove_duplicates(data)
+    (data,) = ppc.remove_NA(data, target_features)
+
+    assert shared.clean_string_slim(literal_input) == expected_clean
+
+    assert data.at[0, feature] == dataset_entry
+    (data,) = shared.entry_wise_cleansing(
+        data,
+        target_features=target_features,
+        cleansing_func=shared.clean_string_slim,
+    )
+    assert data.at[0, feature] == expected_clean
+
+    # the feature analysis is expected to leave 139 entries
+    (data,) = ppc.analyse_feature(data, target_feature=feature)
+    assert len(data) == 139
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+import pytest
+
+# Column names handed out by the raw_data_date_cols fixture; the preprocessing
+# tests pass them as second argument to ppc.load_raw_data.
+DATE_COLS: tuple[str, ...] = (
+    'VorgangsDatum',
+    'ErledigungsDatum',
+    'Arbeitsbeginn',
+    'ErstellungsDatum',
+)
+
+
+@pytest.fixture(scope='session')
+def raw_data_path():
+    """Return the path of the dummy dataset stored next to this conftest.
+
+    The path is anchored to this file instead of the current working
+    directory, so the fixture works regardless of the folder pytest is
+    started from.
+
+    Returns:
+        Absolute ``Path`` of ``Dummy_Dataset_N_1000.csv``.
+    """
+    pth_data = Path(__file__).resolve().parent / 'Dummy_Dataset_N_1000.csv'
+    assert pth_data.exists(), f'test dataset not found: {pth_data}'
+
+    return pth_data
+
+
+@pytest.fixture(scope='session')
+def raw_data_date_cols():
+    """Return the module-level ``DATE_COLS`` tuple of date column names."""
+    return DATE_COLS
--- a/tests/lang_main_config.toml
+++ b/tests/lang_main_config.toml
@@ -1,56 +0,0 @@
-# lang_main: Config file
-
-[paths]
-inputs = '../scripts/inputs/'
-results = '../scripts/results/test_new2/'
-dataset = '../data/02_202307/Export4.csv'
-#results = './results/Export7/'
-#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
-#results = './results/Export7_trunc/'
-#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
-
-[control]
-preprocessing = true
-preprocessing_skip = false
-token_analysis = false
-token_analysis_skip = false
-graph_postprocessing = false
-graph_postprocessing_skip = false
-time_analysis = false
-time_analysis_skip = false
-
-#[export_filenames]
-#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
-
-[preprocess]
-filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
-date_cols = [
-    "VorgangsDatum", 
-    "ErledigungsDatum", 
-    "Arbeitsbeginn", 
-    "ErstellungsDatum",
-]
-threshold_amount_characters = 5
-threshold_similarity = 0.8
-
-[graph_postprocessing]
-threshold_edge_weight = 150
-
-[time_analysis.uniqueness]
-threshold_unique_texts = 4
-criterion_feature = 'HObjektText'
-feature_name_obj_id = 'ObjektID'
-
-[time_analysis.model_input]
-input_features = [
-    'VorgangsTypName',
-    'VorgangsArtText',
-    'VorgangsBeschreibung',
-]
-activity_feature = 'VorgangsTypName'
-activity_types = [
-    'Reparaturauftrag (Portal)',
-    'Störungsmeldung',
-]
-threshold_num_acitivities = 1
-threshold_similarity = 0.8
--- a/tests/pipelines/init.py
+++ b/tests/pipelines/init.py
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -0,0 +1,7 @@
+from lang_main import config, pkg_dir
+
+
+def test_load_config():
+    """Load ``lang_main_config.toml`` from ``pkg_dir`` and check ``info.pkg``."""
+    cfg = config.load_toml_config(pkg_dir / 'lang_main_config.toml')
+    pkg_name = cfg['info']['pkg']
+    assert pkg_name == 'lang_main'
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -0,0 +1,57 @@
+import pytest
+
+from lang_main import io
+
+# Payload that the tests below pickle, base64-encode and compare after loading.
+CONTENT = 'test_lang_main'
+
+
+@pytest.mark.parametrize(
+    'overwrite',
+    [True, False],
+)
+def test_create_saving_folder(tmp_path, overwrite):
+    """A not yet existing folder is created for both ``overwrite_existing`` values."""
+    new_folder = tmp_path / 'test'
+    assert not new_folder.exists()
+
+    io.create_saving_folder(new_folder, overwrite_existing=overwrite)
+
+    assert new_folder.exists()
+    assert new_folder.is_dir()
+
+
+def test_save_load(tmp_path):
+    """Round-trip ``CONTENT`` through pickling and through base64 encoding."""
+    pickle_file = tmp_path / 'test_lang_main.pkl'
+    io.save_pickle(CONTENT, pickle_file)
+    assert io.load_pickle(pickle_file) == CONTENT
+
+    # encoding the object and encoding its pickle file must give the same string
+    encoded_obj = io.encode_to_base64_str(CONTENT)
+    encoded_file = io.encode_file_to_base64_str(pickle_file)
+    assert encoded_obj == encoded_file
+
+    for encoded in (encoded_obj, encoded_file):
+        assert io.decode_from_base64_str(encoded) == CONTENT
+
+
+def test_get_entry_point(tmp_path):
+    """Resolve an existing and a missing file with and without existence check."""
+    suffix = '.pkl'
+    io.save_pickle(CONTENT, tmp_path / 'test_lang_main.pkl')
+
+    existing = io.get_entry_point(
+        tmp_path,
+        'test_lang_main',
+        suffix,
+        check_existence=True,
+    )
+    assert existing.exists()
+
+    # a missing file raises only when the existence check is requested
+    with pytest.raises(FileNotFoundError):
+        _ = io.get_entry_point(
+            tmp_path,
+            'test_lang_main2',
+            suffix,
+            check_existence=True,
+        )
+
+    missing = io.get_entry_point(
+        tmp_path,
+        'test_lang_main2',
+        suffix,
+        check_existence=False,
+    )
+    assert not missing.exists()
--- a/tests/test_lang_main_init.py
+++ b/tests/test_lang_main_init.py
@@ -0,0 +1,5 @@
+from lang_main import BASE_PATH
+
+
+def test_base_path():
+    """``BASE_PATH`` imported from ``lang_main`` must not be ``None``."""
+    assert BASE_PATH is not None
--- a/tests/test_model_loader.py
+++ b/tests/test_model_loader.py
@@ -0,0 +1,113 @@
+import pytest
+from sentence_transformers import SentenceTransformer
+from spacy.language import Language
+
+from lang_main import model_loader
+from lang_main.constants import (
+    STFR_MODEL_ARGS_ONNX,
+    SimilarityFunction,
+    SpacyModelTypes,
+    STFRBackends,
+    STFRDeviceTypes,
+    STFRModelTypes,
+)
+from lang_main.types import LanguageModels
+
+
+@pytest.mark.parametrize(
+    'similarity_func',
+    [
+        SimilarityFunction.COSINE,
+        SimilarityFunction.DOT,
+    ],
+)
+@pytest.mark.parametrize(
+    'model_name',
+    [
+        STFRModelTypes.ALL_DISTILROBERTA_V1,
+        STFRModelTypes.ALL_MINI_LM_L12_V2,
+        STFRModelTypes.ALL_MINI_LM_L6_V2,
+        STFRModelTypes.ALL_MPNET_BASE_V2,
+    ],
+)
+@pytest.mark.mload
+def test_load_sentence_transformer(
+    model_name,
+    similarity_func,
+) -> None:
+    """Load each model/similarity combination with the Torch backend on CPU."""
+    load_options = {
+        'backend': STFRBackends.TORCH,
+        'device': STFRDeviceTypes.CPU,
+        'model_kwargs': None,
+    }
+    loaded = model_loader.load_sentence_transformer(
+        model_name=model_name,
+        similarity_func=similarity_func,
+        **load_options,
+    )
+    assert isinstance(loaded, SentenceTransformer)
+
+
+@pytest.mark.parametrize(
+    'similarity_func',
+    [
+        SimilarityFunction.COSINE,
+        SimilarityFunction.DOT,
+    ],
+)
+@pytest.mark.parametrize(
+    'model_name',
+    [
+        STFRModelTypes.ALL_DISTILROBERTA_V1,
+        STFRModelTypes.ALL_MINI_LM_L12_V2,
+        STFRModelTypes.ALL_MINI_LM_L6_V2,
+        STFRModelTypes.ALL_MPNET_BASE_V2,
+    ],
+)
+@pytest.mark.mload
+def test_load_sentence_transformer_onnx(
+    model_name,
+    similarity_func,
+) -> None:
+    """Load each model/similarity combination with the ONNX backend on CPU."""
+    load_options = {
+        'backend': STFRBackends.ONNX,
+        'device': STFRDeviceTypes.CPU,
+        'model_kwargs': STFR_MODEL_ARGS_ONNX,
+    }
+    loaded = model_loader.load_sentence_transformer(
+        model_name=model_name,
+        similarity_func=similarity_func,
+        **load_options,  # type: ignore
+    )
+    assert isinstance(loaded, SentenceTransformer)
+
+
+@pytest.mark.parametrize(
+    'model_name',
+    [
+        SpacyModelTypes.DE_CORE_NEWS_SM,
+        SpacyModelTypes.DE_CORE_NEWS_MD,
+        SpacyModelTypes.DE_CORE_NEWS_LG,
+        SpacyModelTypes.DE_DEP_NEWS_TRF,
+    ],
+)
+@pytest.mark.mload
+def test_load_spacy_model(
+    model_name,
+):
+    """Each listed spaCy model loads as a ``spacy.language.Language`` object."""
+    loaded = model_loader.load_spacy(model_name=model_name)
+    assert isinstance(loaded, Language)
+
+
+@pytest.mark.mload
+def test_instantiate_spacy_model():
+    """``instantiate_model`` returns a spaCy ``Language`` for ``LanguageModels.SPACY``."""
+    loader_map = model_loader.MODEL_LOADER_MAP
+    instance = model_loader.instantiate_model(
+        model_load_map=loader_map,
+        model=LanguageModels.SPACY,
+    )
+    assert isinstance(instance, Language)
+
+
+@pytest.mark.mload
+def test_instantiate_stfr_model():
+    """``instantiate_model`` returns a ``SentenceTransformer`` for that model kind."""
+    loader_map = model_loader.MODEL_LOADER_MAP
+    instance = model_loader.instantiate_model(
+        model_load_map=loader_map,
+        model=LanguageModels.SENTENCE_TRANSFORMER,
+    )
+    assert isinstance(instance, SentenceTransformer)