started adding comprehensive unit tests

This commit is contained in:
Florian Förster
2024-11-13 17:54:47 +01:00
parent a0ca71ea87
commit 6781b4a132
32 changed files with 4042 additions and 1430 deletions

File diff suppressed because it is too large Load Diff

BIN
tests/analyse_dataset.xlsx Normal file

Binary file not shown.

View File

View File

@@ -0,0 +1,168 @@
import networkx as nx
import pytest
from lang_main.analysis import graphs
# Name passed to `graphs.TokenGraph` in `build_init_graph` and compared
# against `tk_graph.name` in `test_tk_graph_properties`.
TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'
def build_init_graph(token_graph: bool):
    """Build the small weighted, directed reference graph used by the tests.

    Six directed edges over the nodes 1-4 are created; their weights run
    from 1 to 6 in definition order. Both (1, 2) and (2, 1) are present.

    Args:
        token_graph: If true, the edges are added to a ``graphs.TokenGraph``
            named ``TK_GRAPH_NAME`` with logging disabled; otherwise to a
            plain ``nx.DiGraph``.

    Returns:
        The populated graph object.
    """
    node_pairs = ((1, 2), (1, 3), (2, 4), (3, 4), (1, 4), (2, 1))
    # one attribute dict per edge, weights 1..6 assigned by position
    weighted_edges = [
        (source, target, {'weight': weight})
        for weight, (source, target) in enumerate(node_pairs, start=1)
    ]

    if token_graph:
        container = graphs.TokenGraph(name=TK_GRAPH_NAME, enable_logging=False)
    else:
        container = nx.DiGraph()
    container.add_edges_from(weighted_edges)

    return container
@pytest.fixture(scope='module')
def graph():
    """Module-scoped plain directed graph produced by ``build_init_graph``."""
    directed_graph = build_init_graph(token_graph=False)
    return directed_graph
@pytest.fixture(scope='module')
def tk_graph():
    """Module-scoped token graph produced by ``build_init_graph``."""
    token_graph_obj = build_init_graph(token_graph=True)
    return token_graph_obj
def test_graph_size(graph):
    """The reference graph is expected to hold four nodes and six edges."""
    node_count, edge_count = len(graph.nodes), len(graph.edges)

    assert node_count == 4
    assert edge_count == 6
def test_save_to_GraphML(graph, tmp_path):
    """Saving the graph must leave a ``.graphml`` file in the target folder."""
    filename = 'test_graphML'
    expected_file = (tmp_path / filename).with_suffix('.graphml')

    graphs.save_to_GraphML(graph, saving_path=tmp_path, filename=filename)

    assert expected_file.exists()
def test_metadata_retrieval(graph):
    """Metadata of the reference graph must match the known reference values."""
    # NOTE(review): the memory figures are fixed reference values; they may
    # depend on platform/interpreter -- confirm if this test is flaky.
    expected = {
        'num_nodes': 4,
        'num_edges': 6,
        'min_edge_weight': 1,
        'max_edge_weight': 6,
        'node_memory': 112,
        'edge_memory': 336,
        'total_memory': 448,
    }

    metadata = graphs.get_graph_metadata(graph)

    for key, value in expected.items():
        assert metadata[key] == value
def test_graph_update_batch():
    """A batch update with two new edges grows the graph to 6 nodes / 8 edges."""
    fresh_graph = build_init_graph(token_graph=False)
    new_edges = ((4, 5), (5, 6))

    graphs.update_graph(fresh_graph, batch=new_edges, weight_connection=8)

    info = graphs.get_graph_metadata(fresh_graph)
    expected = {
        'num_nodes': 6,
        'num_edges': 8,
        'min_edge_weight': 1,
        'max_edge_weight': 8,
    }
    for key, value in expected.items():
        assert info[key] == value
def test_graph_update_single_new():
    """Adding one not yet existing edge (4 -> 5) adds a node and an edge."""
    fresh_graph = build_init_graph(token_graph=False)

    graphs.update_graph(fresh_graph, parent=4, child=5, weight_connection=7)

    info = graphs.get_graph_metadata(fresh_graph)
    expected = {
        'num_nodes': 5,
        'num_edges': 7,
        'min_edge_weight': 1,
        'max_edge_weight': 7,
    }
    for key, value in expected.items():
        assert info[key] == value
def test_graph_update_single_existing():
    """Updating the existing edge 1 -> 4 keeps the size and raises its weight.

    The expected maximum weight of 10 corresponds to the additional weight 5
    on top of the weight 5 that ``build_init_graph`` assigns to edge (1, 4).
    """
    fresh_graph = build_init_graph(token_graph=False)

    graphs.update_graph(fresh_graph, parent=1, child=4, weight_connection=5)

    info = graphs.get_graph_metadata(fresh_graph)
    expected = {
        'num_nodes': 4,
        'num_edges': 6,
        'min_edge_weight': 1,
        'max_edge_weight': 10,
    }
    for key, value in expected.items():
        assert info[key] == value
@pytest.mark.parametrize('cast_int', [True, False])
def test_graph_undirected_conversion(graph, cast_int):
    """Opposing directed edges must end up as one edge with the summed weight."""
    undirected = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)

    # edges: (1, 2, w=1) and (2, 1, w=6) --> undirected: (1, 2, w=7)
    merged_weight = undirected[1][2]['weight']
    assert merged_weight == pytest.approx(7.0)
def test_graph_cytoscape_conversion(graph):
    """Check first and last element of the Cytoscape export and the weight range."""
    elements, weight_range = graphs.convert_graph_to_cytoscape(graph)
    first_element, last_element = elements[0], elements[-1]

    # the first element is expected to be a node entry ...
    assert first_element['data']['id'] == 1  # type: ignore

    # ... and the last one the edge 3 -> 4 with weight 4
    last_data = last_element['data']
    assert last_data['source'] == 3  # type: ignore
    assert last_data['target'] == 4  # type: ignore
    assert last_data['weight'] == 4  # type: ignore

    assert weight_range['min'] == 1
    assert weight_range['max'] == 6
def test_tk_graph_properties(tk_graph):
    """Check name, graph views and both metadata sets of the token graph."""
    assert tk_graph.name == TK_GRAPH_NAME
    assert isinstance(tk_graph.directed, graphs.TokenGraph)
    assert isinstance(tk_graph.undirected, nx.Graph)

    # metadata is refreshed explicitly before it is read
    tk_graph.update_metadata()

    expected_directed = {
        'num_nodes': 4,
        'num_edges': 6,
        'min_edge_weight': 1,
        'max_edge_weight': 6,
        'node_memory': 112,
        'edge_memory': 336,
        'total_memory': 448,
    }
    directed_info = tk_graph.metadata_directed
    for key, value in expected_directed.items():
        assert directed_info[key] == value

    # the undirected view is expected to hold one edge fewer (5 instead of 6)
    expected_undirected = {
        'num_nodes': 4,
        'num_edges': 5,
        'min_edge_weight': 2,
        'max_edge_weight': 7,
        'node_memory': 112,
        'edge_memory': 280,
        'total_memory': 392,
    }
    undirected_info = tk_graph.metadata_undirected
    for key, value in expected_undirected.items():
        assert undirected_info[key] == value
def test_graph_degree_filter(tk_graph):
    """Filtering with both degree bounds set to 3 must leave two nodes."""
    degree = 3
    remaining = graphs.filter_graph_by_node_degree(
        tk_graph,
        bound_lower=degree,
        bound_upper=degree,
    )

    assert len(remaining.nodes) == 2
def test_graph_edge_number_filter(tk_graph):
    """Limit the graph to one edge, then drop nodes below degree one."""
    number_edges_limit = 1

    edge_limited = graphs.filter_graph_by_number_edges(
        tk_graph,
        limit=number_edges_limit,
    )
    assert len(edge_limited.edges) == number_edges_limit

    # lower bound of 1 and no upper bound
    degree_filtered = graphs.filter_graph_by_node_degree(
        edge_limited,
        bound_lower=1,
        bound_upper=None,
    )
    remaining_nodes = len(degree_filtered.nodes)
    assert remaining_nodes == 2, 'one edge should result in only two nodes'

View File

@@ -0,0 +1,73 @@
"""testing each function in a consecutive way like each one is
executed in a pipeline
"""
from lang_main.analysis import preprocessing as ppc
from lang_main.analysis import shared
def test_load_data(raw_data_path, raw_data_date_cols):
    """Loading the raw dataset must yield 1000 entries."""
    [frame] = ppc.load_raw_data(raw_data_path, raw_data_date_cols)

    entry_count = len(frame)
    assert entry_count == 1000
def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols):
    """After duplicate removal 999 entries are expected to remain."""
    [frame] = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    [frame] = ppc.remove_duplicates(frame)

    entry_count = len(frame)
    assert entry_count == 999
def test_remove_na(raw_data_path, raw_data_date_cols):
    """After duplicate and NA removal 998 entries are expected to remain."""
    target_features: tuple[str] = ('VorgangsBeschreibung',)

    [frame] = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    [frame] = ppc.remove_duplicates(frame)
    [frame] = ppc.remove_NA(frame, target_features)

    entry_count = len(frame)
    assert entry_count == 998
# def test_string_cleansing():
# string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
# cleaned_string = shared.clean_string_slim(string)
# target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
# assert cleaned_string == target_string
def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols):
    """Check string cleansing on its own and applied entry-wise to the data."""
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    feature = 'VorgangsBeschreibung'

    # preceding pipeline steps
    [frame] = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    [frame] = ppc.remove_duplicates(frame)
    [frame] = ppc.remove_NA(frame, target_features)

    # cleansing function applied to a single string
    raw_text = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert shared.clean_string_slim(raw_text) == target_string

    # first entry of the dataset before cleansing
    stored_text = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert frame.at[0, feature] == stored_text

    [frame] = shared.entry_wise_cleansing(
        frame,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )

    # first entry of the dataset after cleansing
    assert frame.at[0, feature] == target_string
def test_analyse_feature(raw_data_path, raw_data_date_cols):
    """Run the pipeline up to feature analysis and check the resulting size."""
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    feature = 'VorgangsBeschreibung'

    # preceding pipeline steps
    [frame] = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    [frame] = ppc.remove_duplicates(frame)
    [frame] = ppc.remove_NA(frame, target_features)

    # cleansing function applied to a single string
    raw_text = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert shared.clean_string_slim(raw_text) == target_string

    # first entry of the dataset before cleansing
    stored_text = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert frame.at[0, feature] == stored_text

    [frame] = shared.entry_wise_cleansing(
        frame,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert frame.at[0, feature] == target_string

    # the analysed feature is the first (and only) target feature
    [frame] = ppc.analyse_feature(frame, target_feature=target_features[0])
    entry_count = len(frame)
    assert entry_count == 139

23
tests/conftest.py Normal file
View File

@@ -0,0 +1,23 @@
from pathlib import Path
import pytest
# Column names handed to the tests through the `raw_data_date_cols` fixture.
# Presumably the date columns of the dummy dataset -- verify against the loader.
DATE_COLS: tuple[str, ...] = (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
'ErstellungsDatum',
)
@pytest.fixture(scope='session')
def raw_data_path():
    """Return the path to the dummy dataset shipped next to this conftest.

    The path is anchored to the directory of this file instead of the
    current working directory, so the fixture also works when pytest is
    started from a directory other than the repository root.

    Returns:
        Absolute ``Path`` of ``Dummy_Dataset_N_1000.csv``.

    Raises:
        AssertionError: If the dataset file does not exist.
    """
    pth_data = Path(__file__).resolve().parent / 'Dummy_Dataset_N_1000.csv'
    assert pth_data.exists(), f'test dataset not found: {pth_data}'
    return pth_data
@pytest.fixture(scope='session')
def raw_data_date_cols():
    """Session-scoped access to the module-level ``DATE_COLS`` tuple."""
    date_columns = DATE_COLS
    return date_columns

View File

@@ -1,56 +0,0 @@
# lang_main: Config file
[paths]
inputs = '../scripts/inputs/'
results = '../scripts/results/test_new2/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

View File

7
tests/test_config.py Normal file
View File

@@ -0,0 +1,7 @@
from lang_main import config, pkg_dir
def test_load_config():
    """The packaged TOML config must load and name the package ``lang_main``."""
    cfg = config.load_toml_config(pkg_dir / 'lang_main_config.toml')

    pkg_name = cfg['info']['pkg']
    assert pkg_name == 'lang_main'

57
tests/test_io.py Normal file
View File

@@ -0,0 +1,57 @@
import pytest
from lang_main import io
# Payload that the tests below pickle and base64-encode/decode.
CONTENT = 'test_lang_main'
@pytest.mark.parametrize(
    'overwrite',
    [True, False],
)
def test_create_saving_folder(tmp_path, overwrite):
    """A not yet existing folder is created for both ``overwrite`` settings."""
    folder = tmp_path / 'test'
    # precondition: nothing there yet
    assert not folder.exists()

    io.create_saving_folder(folder, overwrite_existing=overwrite)

    assert folder.exists()
    assert folder.is_dir()
def test_save_load(tmp_path):
    """Round-trip ``CONTENT`` through pickle and the base64 helpers."""
    pickle_file = tmp_path / 'test_lang_main.pkl'

    # pickle round trip
    io.save_pickle(CONTENT, pickle_file)
    assert io.load_pickle(pickle_file) == CONTENT

    # encoding the object and encoding its pickle file must agree
    encoded_obj = io.encode_to_base64_str(CONTENT)
    encoded_file = io.encode_file_to_base64_str(pickle_file)
    assert encoded_obj == encoded_file

    # both encodings decode back to the original payload
    decoded_obj = io.decode_from_base64_str(encoded_obj)
    assert decoded_obj == CONTENT
    decoded_file = io.decode_from_base64_str(encoded_file)
    assert decoded_file == CONTENT
def test_get_entry_point(tmp_path):
    """Entry point lookup for an existing and a missing pickle file."""
    pickle_file = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, pickle_file)

    # existing file, existence check enabled
    existing = io.get_entry_point(
        tmp_path,
        'test_lang_main',
        '.pkl',
        check_existence=True,
    )
    assert existing.exists()

    # missing file, existence check enabled --> error expected
    with pytest.raises(FileNotFoundError):
        io.get_entry_point(
            tmp_path,
            'test_lang_main2',
            '.pkl',
            check_existence=True,
        )

    # missing file, existence check disabled --> path is returned anyway
    missing = io.get_entry_point(
        tmp_path,
        'test_lang_main2',
        '.pkl',
        check_existence=False,
    )
    assert not missing.exists()

View File

@@ -0,0 +1,5 @@
from lang_main import BASE_PATH
def test_base_path():
    """``BASE_PATH`` exported by ``lang_main`` must be set."""
    base_path = BASE_PATH
    assert base_path is not None

113
tests/test_model_loader.py Normal file
View File

@@ -0,0 +1,113 @@
import pytest
from sentence_transformers import SentenceTransformer
from spacy.language import Language
from lang_main import model_loader
from lang_main.constants import (
STFR_MODEL_ARGS_ONNX,
SimilarityFunction,
SpacyModelTypes,
STFRBackends,
STFRDeviceTypes,
STFRModelTypes,
)
from lang_main.types import LanguageModels
@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer(
    model_name,
    similarity_func,
) -> None:
    """Load every model/similarity combination with the torch backend on CPU."""
    load_options = {
        'model_name': model_name,
        'similarity_func': similarity_func,
        'backend': STFRBackends.TORCH,
        'device': STFRDeviceTypes.CPU,
        'model_kwargs': None,
    }

    loaded = model_loader.load_sentence_transformer(**load_options)

    assert isinstance(loaded, SentenceTransformer)
@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer_onnx(
    model_name,
    similarity_func,
) -> None:
    """Load every model/similarity combination with the ONNX backend on CPU."""
    load_options = {
        'model_name': model_name,
        'similarity_func': similarity_func,
        'backend': STFRBackends.ONNX,
        'device': STFRDeviceTypes.CPU,
        'model_kwargs': STFR_MODEL_ARGS_ONNX,
    }

    loaded = model_loader.load_sentence_transformer(**load_options)  # type: ignore

    assert isinstance(loaded, SentenceTransformer)
@pytest.mark.parametrize(
    'model_name',
    [
        SpacyModelTypes.DE_CORE_NEWS_SM,
        SpacyModelTypes.DE_CORE_NEWS_MD,
        SpacyModelTypes.DE_CORE_NEWS_LG,
        SpacyModelTypes.DE_DEP_NEWS_TRF,
    ],
)
@pytest.mark.mload
def test_load_spacy_model(
    model_name,
):
    """Each listed spaCy model must load as a ``Language`` object."""
    loaded = model_loader.load_spacy(model_name=model_name)

    assert isinstance(loaded, Language)
@pytest.mark.mload
def test_instantiate_spacy_model():
    """Instantiating the spaCy entry of the loader map yields a ``Language``."""
    loader_map = model_loader.MODEL_LOADER_MAP

    instance = model_loader.instantiate_model(
        model_load_map=loader_map,
        model=LanguageModels.SPACY,
    )

    assert isinstance(instance, Language)
@pytest.mark.mload
def test_instantiate_stfr_model():
    """Instantiating the sentence-transformer entry of the loader map."""
    loader_map = model_loader.MODEL_LOADER_MAP

    instance = model_loader.instantiate_model(
        model_load_map=loader_map,
        model=LanguageModels.SENTENCE_TRANSFORMER,
    )

    assert isinstance(instance, SentenceTransformer)