started adding comprehensive unit tests

Florian Förster 2024-11-13 17:54:47 +01:00
parent a0ca71ea87
commit 6781b4a132
32 changed files with 4042 additions and 1430 deletions

View File

@ -2,28 +2,27 @@
[paths] [paths]
inputs = './inputs/' inputs = './inputs/'
results = './results/test_new2/' # results = './results/dummy_N_1000/'
dataset = './01_2_Rohdaten_neu/Export4.csv' # dataset = '../data/Dummy_Dataset_N_1000.csv'
#results = './results/Export7/' results = './results/test_20240807/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = true preprocessing_skip = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false graph_postprocessing_skip = false
time_analysis = false graph_rescaling_skip = false
time_analysis_skip = false graph_static_rendering_skip = false
time_analysis_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess] [preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [ date_cols = [
"VorgangsDatum", "VorgangsDatum",
"ErledigungsDatum", "ErledigungsDatum",
@ -34,17 +33,25 @@ threshold_amount_characters = 5
threshold_similarity = 0.8 threshold_similarity = 0.8
[graph_postprocessing] [graph_postprocessing]
threshold_edge_weight = 150 threshold_edge_number = 330
# threshold_edge_weight = 150
[time_analysis.uniqueness] [time_analysis.uniqueness]
threshold_unique_texts = 4 threshold_unique_texts = 4
criterion_feature = 'HObjektText' criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID' feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input] [time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [ input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung', 'VorgangsBeschreibung',
] ]
activity_feature = 'VorgangsTypName' activity_feature = 'VorgangsTypName'
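For orientation, the [control] table above now contains only *_skip flags, each of which disables one pipeline stage for debugging. A minimal sketch of reading them with the standard library, assuming a local copy of this file named lang_main_config.toml (the package's own loader appears further down in this commit):

import tomllib  # Python >= 3.11, matching requires-python in pyproject.toml

with open('lang_main_config.toml', 'rb') as f:  # hypothetical local copy of the config above
    cfg = tomllib.load(f)

# every *_skip flag set to true disables the corresponding pipeline stage
for key, value in cfg['control'].items():
    if key.endswith('_skip') and value:
        print('stage disabled:', key.removesuffix('_skip'))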

View File

@ -1,58 +0,0 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/'
dataset = '../data/02_202307/Export4.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 300
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

View File

@ -19,6 +19,900 @@
"outputs": [], "outputs": [],
"source": [] "source": []
}, },
{
"cell_type": "code",
"execution_count": 1,
"id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
"metadata": {},
"outputs": [],
"source": [
"edge_weighst = [\n",
" {'weight': 1},\n",
" {'weight': 2},\n",
" {'weight': 3},\n",
" {'weight': 4},\n",
" {'weight': 5},\n",
" {'weight': 6},\n",
"]\n",
"edges = [\n",
" (1, 2),\n",
" (1, 3),\n",
" (2, 4),\n",
" (3, 4),\n",
" (1, 4),\n",
" (2, 1),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
"metadata": {},
"outputs": [],
"source": [
"edges_to_add = []\n",
"for i, edge in enumerate(edges):\n",
" edge = list(edge)\n",
" edge.append(edge_weighst[i])\n",
" edges_to_add.append(tuple(edge))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 2, {'weight': 1}),\n",
" (1, 3, {'weight': 2}),\n",
" (2, 4, {'weight': 3}),\n",
" (3, 4, {'weight': 4}),\n",
" (1, 4, {'weight': 5}),\n",
" (2, 1, {'weight': 6})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"edges_to_add"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "91d4094b-f886-4056-a697-5223f157f1d3",
"metadata": {},
"outputs": [],
"source": [
"tk = graphs.TokenGraph()\n",
"tk.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "518cada9-561a-4b96-b750-3d500d1d28b9",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.analysis import graphs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 4,\n",
" 'num_edges': 6,\n",
" 'min_edge_weight': 1,\n",
" 'max_edge_weight': 6,\n",
" 'node_memory': 112,\n",
" 'edge_memory': 336,\n",
" 'total_memory': 448}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.get_graph_metadata(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca2ce8e8-d72a-4edf-ae42-0f79bd9d19a2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 38,
"id": "223dc592-fa56-4536-a5c2-a166001a6aca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 1.0 2.0 5.0\n",
"2 6.0 0.0 0.0 3.0\n",
"3 0.0 0.0 0.0 4.0\n",
"4 0.0 0.0 0.0 0.0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "1f677da0-7416-413c-adb1-ae1384e09349",
"metadata": {},
"outputs": [],
"source": [
"G_undir = graphs.convert_graph_to_undirected(G, cast_int=False)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "356862fb-2383-43d9-80ba-4fe83646c9d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5.0</td>\n",
" <td>3.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 7.0 2.0 5.0\n",
"2 7.0 0.0 0.0 3.0\n",
"3 2.0 0.0 0.0 4.0\n",
"4 5.0 3.0 4.0 0.0"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G_undir)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "b8a3db1a-0d2a-4635-ab88-7802e2cf59e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir.is_directed()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "46001528-75b0-4fe8-a3ec-353bbd3eeeff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 7.0}"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "cf2dcdff-f0b7-416e-9db3-c7a21ea96b96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([{'data': {'id': 1, 'label': 1}},\n",
" {'data': {'id': 2, 'label': 2}},\n",
" {'data': {'id': 3, 'label': 3}},\n",
" {'data': {'id': 4, 'label': 4}},\n",
" {'data': {'source': 1, 'target': 2, 'weight': 1}},\n",
" {'data': {'source': 1, 'target': 3, 'weight': 2}},\n",
" {'data': {'source': 1, 'target': 4, 'weight': 5}},\n",
" {'data': {'source': 2, 'target': 4, 'weight': 3}},\n",
" {'data': {'source': 2, 'target': 1, 'weight': 6}},\n",
" {'data': {'source': 3, 'target': 4, 'weight': 4}}],\n",
" {'min': 1, 'max': 6})"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.convert_graph_to_cytoscape(G)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f82481e9-873f-4657-80d3-ba75af74fa27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 4, number of edges: 6)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk.update_metadata()\n",
"tk"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4b806620-b469-45ef-823b-db46f8590509",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(G.degree)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a41d019-1b6b-46f7-b13e-ac22da737940",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G.degree[1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e99f2fb4-4c8d-4564-810d-a4b2ed9d6009",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(tk.degree)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1368ebf6-e008-492d-8d15-fe3ed12b78a3",
"metadata": {},
"outputs": [],
"source": [
"g_filt = graphs.filter_graph_by_node_degree(\n",
" tk,\n",
" bound_lower=3,\n",
" bound_upper=3,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "de69f73e-da1d-4479-81da-006f2ce61844",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 2, number of edges: 1)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_filt"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b12fd64d-737e-4c68-94ea-72a817647a04",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 4]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(g_filt.nodes)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "21434c7c-887c-4f9f-884a-48514e2279e0",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2defef69-f09a-4869-984a-27b6373b17b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 1}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0308a2ac-f554-4e24-9ddb-578dd588f3c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(G.edges)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d9ce65-f9a5-40de-a737-098579f6a8ee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7acf4be7-45f3-45e6-87f5-14343f23d610",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9139812b-74ba-45ce-adfc-e57667259692",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main import search_iterative, search_base_path\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "35118922-3a17-4698-93bc-5292a276a4b4",
"metadata": {},
"outputs": [],
"source": [
"from lang_main import constants"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "341299bf-e926-4e55-8545-8805a186f49c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-models')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"constants.MODEL_BASE_FOLDER"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "11ce4062-b229-4d88-967d-6eeb6d0135b7",
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40dac543-1e53-4fd8-a192-88f3527872b2",
"metadata": {},
"outputs": [],
"source": [
"model_kwargs = {\n",
" 'file_name': 'onnx/model_quint8_avx2.onnx',\n",
" 'provider': 'CPUExecutionProvider',\n",
" 'export': False,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8a0eaa9f-e2d2-4106-b80b-80916e9d8bfe",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "215b46e3607e4530b2d8f8227367ef23",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model_quint8_avx2.onnx: 0%| | 0.00/23.0M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in A:\\Arbeitsaufgaben\\lang-models\\models--sentence-transformers--all-MiniLM-L6-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n"
]
}
],
"source": [
"stfr = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine', backend='onnx', model_kwargs=model_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cd921aca-0673-41ec-98a3-18e360a39a41",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.constants import SPACY_MODEL_NAME"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "88173a68-7d8e-4f4c-a4ad-bbf78efaf781",
"metadata": {},
"outputs": [],
"source": [
"import importlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5293976-22ab-406a-ba32-066fd7254394",
"metadata": {},
"outputs": [],
"source": [
"mod = importlib.import_module(SPACY_MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6023a339-02da-429c-acf5-f14a56989357",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'de_dep_news_trf' from 'A:\\\\Arbeitsaufgaben\\\\lang-main\\\\.venv\\\\Lib\\\\site-packages\\\\de_dep_news_trf\\\\__init__.py'>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mod"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5f4fa066-fa0f-4818-9cf9-ec28923150ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main.analysis.shared import clean_string_slim"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "71286836-7eb2-4095-ab82-42d7ac7ed476",
"metadata": {},
"outputs": [],
"source": [
"string = 'Ölleckage durch\\nundichten \\t Ölsumpf,, aber Dichtung intakt??!!!'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e8284e76-e750-458e-bb63-d59d6d57a396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ölleckage durch\n",
"undichten \t Ölsumpf,, aber Dichtung intakt??!!!\n"
]
}
],
"source": [
"print(string)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "82e98d8f-2e24-42f9-a3ed-3b3454ae64f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_string_slim(string)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b527b145-15d2-4961-b441-1843fe9f5c29",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "49c2e2f0-1e6d-4969-b583-8fc15b8930f9",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "16ae5d5c-a0a7-400b-8e38-231c72ad27b5",
"metadata": {},
"outputs": [],
"source": [
"pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3b9fe636-f895-404a-819d-61198d34262d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Am war ich essen. Am hingegen nicht. Und war ich allein.'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'\n",
"string = pattern_dates.sub('', string)\n",
"string"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c49ab3c-e860-42af-ac0c-2f44f075e846",
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,

pdm.lock generated (2079 lines changed)

File diff suppressed because it is too large

View File

@ -15,6 +15,7 @@ dependencies = [
"typing-extensions>=4.12.2", "typing-extensions>=4.12.2",
"tqdm>=4.67.0", "tqdm>=4.67.0",
"python-dateutil>=2.9.0.post0", "python-dateutil>=2.9.0.post0",
"onnx==1.16.1",
] ]
requires-python = ">=3.11" requires-python = ">=3.11"
readme = "README.md" readme = "README.md"
@ -33,6 +34,18 @@ plot = [
cytoscape = [ cytoscape = [
"py4cytoscape>=1.11.0", "py4cytoscape>=1.11.0",
] ]
spacy-trf = [
"de-dep-news-trf @ https://github.com/explosion/spacy-models/releases/download/de_dep_news_trf-3.8.0/de_dep_news_trf-3.8.0-py3-none-any.whl",
]
spacy-sm = [
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl",
]
spacy-md = [
"de-core-news-md @ https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.8.0/de_core_news_md-3.8.0-py3-none-any.whl",
]
spacy-lg = [
"de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0-py3-none-any.whl",
]
[build-system] [build-system]
requires = ["pdm-backend"] requires = ["pdm-backend"]
build-backend = "pdm.backend" build-backend = "pdm.backend"
@ -57,6 +70,8 @@ dev = [
"cython>=3.0.10", "cython>=3.0.10",
"openpyxl>=3.1.5", "openpyxl>=3.1.5",
"seaborn>=0.13.2", "seaborn>=0.13.2",
"pytest>=8.3.3",
"pytest-cov>=6.0.0",
] ]
[tool.ruff] [tool.ruff]
@ -74,3 +89,36 @@ select = ["E", "F", "I"]
[tool.ruff.lint.isort] [tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"] extra-standard-library = ["typing_extensions"]
[tool.pytest.ini_options]
addopts = [
"-vvl",
"--import-mode=importlib",
]
testpaths = [
"tests",
]
filterwarnings = [
'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
"mload: marks tests with loading of language models (deselect with '-m \"not mload\"')",
]
log_cli = true
[tool.coverage.run]
relative_files = true
source = [
"lang_main",
"tests/",
]
[tool.coverage.report]
exclude_also = [
"def __repr__",
"def __str__",
"@overload",
]
[tool.coverage.html]
directory = "reports/coverage"

python/README.txt (new file, 1 line changed)
View File

@ -0,0 +1 @@
only used to simulate the directory tree in the final solution

View File

@ -1,51 +0,0 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
import warnings
from lang_main.io import load_toml_config
__all__ = [
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = True
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
warnings.warn('Caller file could not be correctly retrieved.')
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the programm.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

View File

@ -1,14 +1,19 @@
import logging import logging
import os
from pathlib import Path from pathlib import Path
from typing import Any, Final from typing import Any, Final
from lang_main.config import load_toml_config
_has_py4cyto: bool = True _has_py4cyto: bool = True
try: try:
import py4cytoscape as p4c import py4cytoscape as p4c
except ImportError: except ImportError:
_has_py4cyto = False _has_py4cyto = False
from lang_main.io import load_toml_config # ** external packages config
# ** Huggingface Hub caching
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'
# ** py4cytoscape config # ** py4cytoscape config
if _has_py4cyto: if _has_py4cyto:
@ -20,6 +25,7 @@ if _has_py4cyto:
p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler()) p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
# ** lang-main config # ** lang-main config
BASE_FOLDERNAME: Final[str] = 'lang-main'
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml' CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
PREFER_INTERNAL_CONFIG: Final[bool] = False PREFER_INTERNAL_CONFIG: Final[bool] = False
@ -75,27 +81,71 @@ def search_iterative(
pattern to look for, first match will be returned, pattern to look for, first match will be returned,
by default CONFIG_FILENAME by default CONFIG_FILENAME
stop_folder_name : str, optional stop_folder_name : str, optional
name of the last folder in the directory tree to search, by default 'python' name of the last folder in the directory tree to search, by default None
Returns Returns
------- -------
Path | None Path | None
Path if corresponding object was found, None otherwise Path if corresponding object was found, None otherwise
""" """
cfg_path: Path | None = None file_path: Path | None = None
stop_folder_reached: bool = False
for it in range(len(starting_path.parents)): for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder search_path = starting_path.parents[it] # do not look in library folder
res = tuple(search_path.glob(glob_pattern)) res = tuple(search_path.glob(glob_pattern))
if res: if res:
cfg_path = res[0] file_path = res[0]
break
elif stop_folder_reached:
break break
if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment
# if this folder is reached, only look up one parent above
stop_folder_reached = True
return file_path
def search_base_path(
starting_path: Path,
stop_folder_name: str | None = None,
) -> Path | None:
"""Iteratively searches the parent directories of the starting path
and looks for folders matching the given name. If a match is encountered,
the parent path will be returned.
Example:
starting_path = path/to/start/folder
stop_folder_name = 'to'
returned path = 'path/'
Parameters
----------
starting_path : Path
non-inclusive starting path
stop_folder_name : str, optional
name of the last folder in the directory tree to search, by default None
Returns
-------
Path | None
Path if corresponding base path was found, None otherwise
"""
stop_folder_path: Path | None = None
base_path: Path | None = None
for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder
if stop_folder_name is not None and search_path.name == stop_folder_name: if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment # library is placed inside a whole python installation for deployment
# only look up to this folder # only look up to this folder
stop_folder_path = search_path
break break
return cfg_path if stop_folder_path is not None:
base_path = stop_folder_path.parent
return base_path
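A brief usage sketch of the two search helpers, mirroring the docstring example above; the paths are placeholders and search_base_path only compares folder names, so they need not exist (assumes the package and its config can be imported, as in the notebook earlier):

from pathlib import Path

from lang_main import search_base_path, search_iterative  # same import as in the notebook

# walk up from the starting path until a folder named 'to' is found, then return its parent
print(search_base_path(Path('path/to/start/folder'), stop_folder_name='to'))  # Path('path')

# look for the first file matching the glob pattern in the parent directories of the start;
# the keyword name glob_pattern is inferred from the function body above
print(search_iterative(Path.cwd(), glob_pattern='lang_main_config.toml'))  # Path or None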
def load_cfg() -> dict[str, Any]: def load_cfg() -> dict[str, Any]:
@ -121,6 +171,10 @@ def load_cfg() -> dict[str, Any]:
CONFIG: Final[dict[str, Any]] = load_cfg() CONFIG: Final[dict[str, Any]] = load_cfg()
base_parent_path = search_base_path(pkg_dir, stop_folder_name=BASE_FOLDERNAME)
if base_parent_path is None:
raise FileNotFoundError('Could not resolve base path of library')
BASE_PATH: Final[Path] = base_parent_path
# ** Cytoscape configuration # ** Cytoscape configuration

View File

@ -48,9 +48,9 @@ def save_to_GraphML(
def get_graph_metadata( def get_graph_metadata(
graph: Graph | DiGraph, graph: Graph | DiGraph,
logging: bool = LOGGING_DEFAULT_GRAPHS, logging: bool = LOGGING_DEFAULT_GRAPHS,
) -> dict[str, int]: ) -> dict[str, float]:
# info about graph # info about graph
graph_info: dict[str, int] = {} graph_info: dict[str, float] = {}
# nodes and edges # nodes and edges
num_nodes = len(graph.nodes) num_nodes = len(graph.nodes)
num_edges = len(graph.edges) num_edges = len(graph.edges)
@ -96,15 +96,6 @@ def update_graph(
child: Hashable | None = None, child: Hashable | None = None,
weight_connection: int | None = None, weight_connection: int | None = None,
) -> None: ) -> None:
# !! not necessary to check for existence of nodes
# !! feature already implemented in NetworkX ``add_edge``
"""
# check if nodes already in Graph
if parent not in graph:
graph.add_node(parent)
if child not in graph:
graph.add_node(child)
"""
if weight_connection is None: if weight_connection is None:
weight_connection = 1 weight_connection = 1
# check if edge not in Graph # check if edge not in Graph
@ -115,9 +106,7 @@ def update_graph(
graph.add_edge(parent, child, weight=weight_connection) graph.add_edge(parent, child, weight=weight_connection)
else: else:
# update edge # update edge
weight = graph[parent][child]['weight'] graph[parent][child]['weight'] += weight_connection
weight += weight_connection
graph[parent][child]['weight'] = weight
# build undirected adjacency matrix # build undirected adjacency matrix
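The simplified update above accumulates weights in place when an edge already exists; a minimal sketch of that behaviour, assuming update_graph lives in lang_main.analysis.graphs alongside the functions used in the notebook:

import networkx as nx

from lang_main.analysis.graphs import update_graph  # assumed module path

g = nx.DiGraph()
update_graph(graph=g, parent='pumpe', child='dichtung', weight_connection=2)  # creates the edge
update_graph(graph=g, parent='pumpe', child='dichtung', weight_connection=3)  # increments its weight
print(g['pumpe']['dichtung']['weight'])  # 5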
@ -249,7 +238,8 @@ def filter_graph_by_node_degree(
bound_lower: int | None, bound_lower: int | None,
bound_upper: int | None, bound_upper: int | None,
) -> TokenGraph: ) -> TokenGraph:
"""filters all nodes which are within the provided bounds by their degree """filters all nodes which are within the provided bounds by their degree,
inclusive limits: bound_lower <= node_degree <= bound_upper are retained
Parameters Parameters
---------- ----------
@ -266,13 +256,14 @@ def filter_graph_by_node_degree(
# filter nodes by degree # filter nodes by degree
original_graph_nodes = copy.deepcopy(graph.nodes) original_graph_nodes = copy.deepcopy(graph.nodes)
filtered_graph = graph.copy() filtered_graph = graph.copy()
filtered_graph_degree = copy.deepcopy(filtered_graph.degree)
if not any([bound_lower, bound_upper]): if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.') logger.warning('No bounds provided, returning original graph.')
return filtered_graph return filtered_graph
for node in original_graph_nodes: for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore degree = cast(int, filtered_graph_degree[node]) # type: ignore
if bound_lower is not None and degree < bound_lower: if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node) filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper: if bound_upper is not None and degree > bound_upper:
@ -540,9 +531,9 @@ class TokenGraph(DiGraph):
self._name = name self._name = name
# directed and undirected graph data # directed and undirected graph data
self._directed = self self._directed = self
self._metadata_directed: dict[str, int] = {} self._metadata_directed: dict[str, float] = {}
self._undirected: Graph | None = None self._undirected: Graph | None = None
self._metadata_undirected: dict[str, int] = {} self._metadata_undirected: dict[str, float] = {}
# indicate rescaled weights # indicate rescaled weights
self.rescaled_weights: bool = False self.rescaled_weights: bool = False
@ -568,12 +559,12 @@ class TokenGraph(DiGraph):
return hash(self.__key()) return hash(self.__key())
""" """
def copy(self) -> Self: def copy(self) -> TokenGraph:
"""returns a (deep) copy of the graph """returns a (deep) copy of the graph
Returns Returns
------- -------
Self TokenGraph
deep copy of the graph deep copy of the graph
""" """
return copy.deepcopy(self) return copy.deepcopy(self)
@ -594,11 +585,11 @@ class TokenGraph(DiGraph):
return self._undirected return self._undirected
@property @property
def metadata_directed(self) -> dict[str, int]: def metadata_directed(self) -> dict[str, float]:
return self._metadata_directed return self._metadata_directed
@property @property
def metadata_undirected(self) -> dict[str, int]: def metadata_undirected(self) -> dict[str, float]:
return self._metadata_undirected return self._metadata_undirected
@overload @overload

View File

@ -30,7 +30,7 @@ if TYPE_CHECKING:
# ** (1) dataset preparation: loading and simple preprocessing # ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple # following functions are used to load a given dataset and perform simple
# duplicate cleansing based on all properties # duplicate cleansing based on all properties
def load_raw_data( def load_raw_data(
path: Path, path: Path,
@ -277,41 +277,41 @@ def merge_similarity_dupl(
# ** ################################################################################# # ** #################################################################################
# TODO check removal # TODO check removal
def build_embedding_map( # def build_embedding_map(
data: Series, # data: Series,
model: GermanSpacyModel | SentenceTransformer, # model: GermanSpacyModel | SentenceTransformer,
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]: # ) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
# dictionary with embeddings # # dictionary with embeddings
embeddings: dict[int, tuple[Embedding, str]] = {} # embeddings: dict[int, tuple[Embedding, str]] = {}
is_spacy = False # is_spacy = False
is_STRF = False # is_STRF = False
if isinstance(model, GermanSpacyModel): # if isinstance(model, GermanSpacyModel):
is_spacy = True # is_spacy = True
elif isinstance(model, SentenceTransformer): # elif isinstance(model, SentenceTransformer):
is_STRF = True # is_STRF = True
if not any((is_spacy, is_STRF)): # if not any((is_spacy, is_STRF)):
raise NotImplementedError('Model type unknown') # raise NotImplementedError('Model type unknown')
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0): # for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
# verbose code: Pyright not inferring types correctly # # verbose code: Pyright not inferring types correctly
idx = cast(int, idx) # idx = cast(int, idx)
text = cast(str, text) # text = cast(str, text)
if is_spacy: # if is_spacy:
model = cast(GermanSpacyModel, model) # model = cast(GermanSpacyModel, model)
embd = cast(SpacyDoc, model(text)) # embd = cast(SpacyDoc, model(text))
embeddings[idx] = (embd, text) # embeddings[idx] = (embd, text)
# check for empty vectors # # check for empty vectors
if not embd.vector_norm: # if not embd.vector_norm:
logger.debug('--- Unknown Words ---') # logger.debug('--- Unknown Words ---')
logger.debug('embd.text: %s has no vector', embd.text) # logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF: # elif is_STRF:
model = cast(SentenceTransformer, model) # model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False)) # embd = cast(Tensor, model.encode(text, show_progress_bar=False))
embeddings[idx] = (embd, text) # embeddings[idx] = (embd, text)
return embeddings, (is_spacy, is_STRF) # return embeddings, (is_spacy, is_STRF)
# adapt interface # adapt interface
@ -320,276 +320,275 @@ def build_embedding_map(
# build similarity matrix out of embeddings # build similarity matrix out of embeddings
def build_cosSim_matrix( # def build_cosSim_matrix(
data: Series, # data: Series,
model: GermanSpacyModel | SentenceTransformer, # model: GermanSpacyModel | SentenceTransformer,
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]: # ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
# build empty matrix # # build empty matrix
df_index = data.index # df_index = data.index
cosineSim_idx_matrix = pd.DataFrame( # cosineSim_idx_matrix = pd.DataFrame(
data=0.0, columns=df_index, index=df_index, dtype=np.float32 # data=0.0, columns=df_index, index=df_index, dtype=np.float32
) # )
logger.info('Start building embedding map...') # logger.info('Start building embedding map...')
# obtain embeddings based on used model # # obtain embeddings based on used model
embds, (is_spacy, is_STRF) = build_embedding_map( # embds, (is_spacy, is_STRF) = build_embedding_map(
data=data, # data=data,
model=model, # model=model,
) # )
logger.info('Embedding map built successfully.') # logger.info('Embedding map built successfully.')
# apply index based mapping for efficient handling of large texts # # apply index based mapping for efficient handling of large texts
combs = combinations(df_index, 2) # combs = combinations(df_index, 2)
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2) # total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
logger.info('Start calculation of similarity scores...') # logger.info('Start calculation of similarity scores...')
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0): # for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
# print(f"{idx1=}, {idx2=}") # # print(f"{idx1=}, {idx2=}")
embd1 = embds[idx1][0] # embd1 = embds[idx1][0]
embd2 = embds[idx2][0] # embd2 = embds[idx2][0]
# calculate similarity based on model type # # calculate similarity based on model type
if is_spacy: # if is_spacy:
embd1 = cast(SpacyDoc, embds[idx1][0]) # embd1 = cast(SpacyDoc, embds[idx1][0])
embd2 = cast(SpacyDoc, embds[idx2][0]) # embd2 = cast(SpacyDoc, embds[idx2][0])
cosSim = embd1.similarity(embd2) # cosSim = embd1.similarity(embd2)
elif is_STRF: # elif is_STRF:
embd1 = cast(Tensor, embds[idx1][0]) # embd1 = cast(Tensor, embds[idx1][0])
embd2 = cast(Tensor, embds[idx2][0]) # embd2 = cast(Tensor, embds[idx2][0])
cosSim = sentence_transformers.util.cos_sim(embd1, embd2) # cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
cosSim = cast(float, cosSim.item()) # cosSim = cast(float, cosSim.item())
cosineSim_idx_matrix.at[idx1, idx2] = cosSim # cosineSim_idx_matrix.at[idx1, idx2] = cosSim
logger.info('Similarity scores calculated successfully.') # logger.info('Similarity scores calculated successfully.')
return cosineSim_idx_matrix, embds # return cosineSim_idx_matrix, embds
# obtain index pairs with cosine similarity # obtain index pairs with cosine similarity
# greater than or equal to given threshold value # greater than or equal to given threshold value
def filt_thresh_cosSim_matrix( # def filt_thresh_cosSim_matrix(
cosineSim_idx_matrix: DataFrame, # cosineSim_idx_matrix: DataFrame,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
threshold: float, # threshold: float,
) -> tuple[Series, dict[int, tuple[Embedding, str]]]: # ) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
"""filter similarity matrix by threshold value and return index pairs with # """filter similarity matrix by threshold value and return index pairs with
a similarity score greater than the provided threshold # a similarity score greater than the provided threshold
Parameters # Parameters
---------- # ----------
threshold : float # threshold : float
similarity threshold # similarity threshold
cosineSim_idx_matrix : DataFrame # cosineSim_idx_matrix : DataFrame
similarity matrix # similarity matrix
Returns # Returns
------- # -------
Series # Series
series with multi index (index pairs) and corresponding similarity score # series with multi index (index pairs) and corresponding similarity score
""" # """
cosineSim_filt = cast( # cosineSim_filt = cast(
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack() # Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
) # )
return cosineSim_filt, embds # return cosineSim_filt, embds
def list_cosSim_dupl_candidates( # def list_cosSim_dupl_candidates(
cosineSim_filt: Series, # cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
save_candidates: bool = False, # save_candidates: bool = False,
saving_path: Path | None = None, # saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates', # filename: str = 'CosSim-FilterCandidates',
pipeline: Pipeline | None = None, # pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: # ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than # """providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes # given threshold; more suitable for debugging purposes
Returns # Returns
------- # -------
DataFrame # DataFrame
contains indices, corresponding texts and similarity score to evaluate results # contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]] # list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than # list containing relevant index pairs for entries with similarity score greater than
given threshold # given threshold
""" # """
logger.info('Start gathering of similarity candidates...') # logger.info('Start gathering of similarity candidates...')
# compare found duplicates # # compare found duplicates
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score'] # columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns) # df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] # index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore # for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
# get text content from embedding as second tuple entry # # get text content from embedding as second tuple entry
content = [ # content = [
[ # [
idx1, # idx1,
embds[idx1][1], # embds[idx1][1],
idx2, # idx2,
embds[idx2][1], # embds[idx2][1],
score, # score,
] # ]
] # ]
# add candidates to collection DataFrame # # add candidates to collection DataFrame
df_conc = pd.DataFrame(columns=columns, data=content) # df_conc = pd.DataFrame(columns=columns, data=content)
if df_candidates.empty: # if df_candidates.empty:
df_candidates = df_conc.copy() # df_candidates = df_conc.copy()
else: # else:
df_candidates = pd.concat([df_candidates, df_conc]) # df_candidates = pd.concat([df_candidates, df_conc])
# save index pairs # # save index pairs
index_pairs.append((idx1, idx2)) # index_pairs.append((idx1, idx2))
logger.info('Similarity candidates gathered successfully.') # logger.info('Similarity candidates gathered successfully.')
if save_candidates: # if save_candidates:
if saving_path is None: # if saving_path is None:
raise ValueError( # raise ValueError(
('Saving path must be provided if duplicate ' 'candidates should be saved.') # ('Saving path must be provided if duplicate ' 'candidates should be saved.')
) # )
elif pipeline is not None: # elif pipeline is not None:
target_filename = ( # target_filename = (
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx' # f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
) # )
elif pipeline is None: # elif pipeline is None:
target_filename = f'{filename}.xlsx' # target_filename = f'{filename}.xlsx'
logger.info('Saving similarity candidates...') # logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename) # target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path) # df_candidates.to_excel(target_path)
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path) # logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
return index_pairs, embds # return index_pairs, embds
# TODO: change implementation fully to SentenceTransformer # TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function # usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py`` # from time analysis --> moved to ``helpers.py``
"""
def similar_ids_connection_graph(
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
# build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
for (idx1, idx2) in similar_idx_pairs:
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True) # def similar_ids_connection_graph(
# similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
# ) -> tuple[Graph, dict[str, int]]:
# # build index graph to obtain graph of connected (similar) indices
# # use this graph to get connected components (indices which belong together)
# # retain semantic connection on whole dataset
# similar_id_graph = nx.Graph()
# for (idx1, idx2) in similar_idx_pairs:
# # inplace operation, parent/child do not really exist in undirected graph
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
return similar_id_graph, graph_info # graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
def similar_ids_groups( # return similar_id_graph, graph_info
dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=dupl_id_graph))
for id_group in ids_groups: # def similar_ids_groups(
yield list(id_group) # dupl_id_graph: Graph,
""" # ) -> Iterator[list[PandasIndex]]:
# # groups of connected indices
# ids_groups = cast(Iterator[set[PandasIndex]],
# nx.connected_components(G=dupl_id_graph))
# for id_group in ids_groups:
# yield list(id_group)
# merge duplicates # # merge duplicates
def merge_similarity_dupl_old( # def merge_similarity_dupl_old(
data: DataFrame, # data: DataFrame,
dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]], # dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]: # ) -> tuple[DataFrame]:
# copy pre-cleaned data # # copy pre-cleaned data
temp = data.copy() # temp = data.copy()
index = temp.index # index = temp.index
# logger.info("Start merging of similarity candidates...") # # logger.info("Start merging of similarity candidates...")
# iterate over index pairs # # iterate over index pairs
for i1, i2 in tqdm(dupl_idx_pairs): # for i1, i2 in tqdm(dupl_idx_pairs):
# if an entry does not exist any more, skip this pair # # if an entry does not exist any more, skip this pair
if i1 not in index or i2 not in index: # if i1 not in index or i2 not in index:
continue # continue
# merge num occur # # merge num occur
num_occur1 = temp.at[i1, 'num_occur'] # num_occur1 = temp.at[i1, 'num_occur']
num_occur2 = temp.at[i2, 'num_occur'] # num_occur2 = temp.at[i2, 'num_occur']
new_num_occur = num_occur1 + num_occur2 # new_num_occur = num_occur1 + num_occur2
# merge associated object ids # # merge associated object ids
assoc_ids1 = temp.at[i1, 'assoc_obj_ids'] # assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
assoc_ids2 = temp.at[i2, 'assoc_obj_ids'] # assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
new_assoc_ids = np.append(assoc_ids1, assoc_ids2) # new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
new_assoc_ids = np.unique(new_assoc_ids.flatten()) # new_assoc_ids = np.unique(new_assoc_ids.flatten())
# recalculate num associated obj ids # # recalculate num associated obj ids
new_num_assoc_obj_ids = len(new_assoc_ids) # new_num_assoc_obj_ids = len(new_assoc_ids)
# write properties to first entry # # write properties to first entry
temp.at[i1, 'num_occur'] = new_num_occur # temp.at[i1, 'num_occur'] = new_num_occur
temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids # temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids # temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids
# drop second entry # # drop second entry
temp = temp.drop(index=i2) # temp = temp.drop(index=i2)
index = temp.index # index = temp.index
# logger.info("Similarity candidates merged successfully.") # # logger.info("Similarity candidates merged successfully.")
return (temp,) # return (temp,)
# ** debugging and evaluation # ** debugging and evaluation
def choose_cosSim_dupl_candidates( # def choose_cosSim_dupl_candidates(
cosineSim_filt: Series, # cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]: # ) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
"""providing an overview of candidates with a similarity score greater than # """providing an overview of candidates with a similarity score greater than
given threshold, but decision is made manually by iterating through the candidates # given threshold, but decision is made manually by iterating through the candidates
with user interaction; more suitable for debugging purposes # with user interaction; more suitable for debugging purposes
Returns # Returns
------- # -------
DataFrame # DataFrame
contains indices, corresponding texts and similarity score to evaluate results # contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]] # list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than # list containing relevant index pairs for entries with similarity score greater than
given threshold # given threshold
""" # """
# compare found duplicates # # compare found duplicates
columns = ['idx1', 'text1', 'idx2', 'text2', 'score'] # columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns) # df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] # index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore # for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# get texts for comparison # # get texts for comparison
text1 = embds[idx1][1] # text1 = embds[idx1][1]
text2 = embds[idx2][1] # text2 = embds[idx2][1]
# get decision # # get decision
print('---------- New Decision ----------') # print('---------- New Decision ----------')
print('text1:\n', text1, '\n', flush=True) # print('text1:\n', text1, '\n', flush=True)
print('text2:\n', text2, '\n', flush=True) # print('text2:\n', text2, '\n', flush=True)
decision = input('Please enter >>y<< if this is a duplicate, else hit enter:') # decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')
if not decision == 'y': # if not decision == 'y':
continue # continue
# get text content from embedding as second tuple entry # # get text content from embedding as second tuple entry
content = [ # content = [
[ # [
idx1, # idx1,
text1, # text1,
idx2, # idx2,
text2, # text2,
score, # score,
] # ]
] # ]
df_conc = pd.DataFrame(columns=columns, data=content) # df_conc = pd.DataFrame(columns=columns, data=content)
df_candidates = pd.concat([df_candidates, df_conc]) # df_candidates = pd.concat([df_candidates, df_conc])
index_pairs.append((idx1, idx2)) # index_pairs.append((idx1, idx2))
return df_candidates, index_pairs # return df_candidates, index_pairs

View File

@ -22,7 +22,7 @@ pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+') pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+') pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])') pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?') pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)?([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}') pattern_whitespace = re.compile(r'[ ]{2,}')
@ -43,7 +43,7 @@ def clean_string_slim(string: str) -> str:
cleaned entry cleaned entry
""" """
# remove special chars # remove special chars
string = pattern_escape_newline.sub('. ', string) # string = pattern_escape_newline.sub(' ', string)
string = pattern_escape_seq.sub(' ', string) string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string) string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string) # string = pattern_dates.sub('', string)
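The loosened pattern_dates (second group now optional as well) can be checked in isolation; a small sketch reproducing the notebook experiment from earlier in this commit:

import re

pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)?([\d]{2,4})?')  # regex from the hunk above
pattern_whitespace = re.compile(r'[ ]{2,}')  # defined further up in the same module

text = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'
stripped = pattern_dates.sub('', text)  # removes full dates, day.month fragments and bare years
print(pattern_whitespace.sub(' ', stripped))
# 'Am war ich essen. Am hingegen nicht. Und war ich allein.'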
@ -127,7 +127,7 @@ def candidates_by_index(
def similar_index_connection_graph( def similar_index_connection_graph(
similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]], similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]: ) -> tuple[Graph, dict[str, float]]:
# build index graph to obtain graph of connected (similar) indices # build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together) # use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset # retain semantic connection on whole dataset

src/lang_main/config.py (new file, 17 lines changed)
View File

@ -0,0 +1,17 @@
from __future__ import annotations
import sys
import tomllib
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from pathlib import Path
def load_toml_config(
path_to_toml: str | Path,
) -> dict[str, Any]:
with open(path_to_toml, 'rb') as f:
data = tomllib.load(f)
print('Loaded TOML config file successfully.', file=sys.stderr, flush=True)
return data
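A short usage sketch of the relocated loader; the path is a placeholder:

from pathlib import Path

from lang_main.config import load_toml_config

cfg = load_toml_config(Path('lang_main_config.toml'))  # prints its success message to stderr
print(cfg['paths']['inputs'])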

View File

@ -2,22 +2,21 @@ from enum import Enum # noqa: I001
from importlib.util import find_spec from importlib.util import find_spec
from pathlib import Path from pathlib import Path
from typing import Final from typing import Final
import os
from sentence_transformers import SimilarityFunction from sentence_transformers import SimilarityFunction
from lang_main import CONFIG, CYTO_PATH_STYLESHEET from lang_main import CONFIG, CYTO_PATH_STYLESHEET, BASE_PATH
from lang_main import model_loader as m_load
from lang_main.types import ( from lang_main.types import (
CytoLayoutProperties, CytoLayoutProperties,
CytoLayouts, CytoLayouts,
LanguageModels,
ModelLoaderMap,
ONNXExecutionProvider, # noqa: F401 ONNXExecutionProvider, # noqa: F401
STFRBackends, STFRBackends,
STFRDeviceTypes, STFRDeviceTypes,
STFRModelArgs, STFRModelArgs,
STFRModels, STFRModelTypes,
STFRQuantFilenames, # noqa: F401 STFRQuantFilenames, # noqa: F401
SpacyModelTypes,
) )
__all__ = [ __all__ = [
@@ -67,35 +66,29 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
 # ** models
 # ** loading
-SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
-STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
+MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models'
+MODEL_BASE_FOLDER: Final[Path] = BASE_PATH / MODEL_BASE_FOLDER_NAME
+if not MODEL_BASE_FOLDER.exists():
+    raise FileNotFoundError('Language model folder not found.')
+os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
+
+SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_DEP_NEWS_TRF
+STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
 STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
 STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
 STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
-STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
-# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
-#     'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
-#     'provider': ONNXExecutionProvider.CPU,
-#     'export': False,
-# }
-MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
-    LanguageModels.SENTENCE_TRANSFORMER: {
-        'func': m_load.load_sentence_transformer,
-        'kwargs': {
-            'model_name': STFR_MODEL_NAME,
-            'similarity_func': STFR_SIMILARITY,
-            'backend': STFR_BACKEND,
-            'device': STFR_DEVICE,
-            'model_kwargs': STFR_MODEL_ARGS,
-        },
-    },
-    LanguageModels.SPACY: {
-        'func': m_load.load_spacy,
-        'kwargs': {
-            'model_name': SPACY_MODEL_NAME,
-        },
-    },
-}
+STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
+STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
+    'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
+    'provider': ONNXExecutionProvider.CPU,
+    'export': False,
+}
+
+stfr_model_args: STFRModelArgs
+if STFR_BACKEND == STFRBackends.ONNX:
+    stfr_model_args = STFR_MODEL_ARGS_ONNX
+else:
+    stfr_model_args = STFR_MODEL_ARGS_DEFAULT
+STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args

 # ** language dependency analysis
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
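The net effect of the backend switch above: STFR_MODEL_ARGS carries the quantised-ONNX settings only when STFR_BACKEND is set to ONNX; a small illustrative check (not part of the commit, assuming the constants are importable as defined above):

# illustrative check of the resolved STFR_MODEL_ARGS
from lang_main.constants import STFR_BACKEND, STFR_MODEL_ARGS
from lang_main.types import STFRBackends

if STFR_BACKEND == STFRBackends.ONNX:
    # quantised weights are read from 'onnx/model_quint8_avx2.onnx'
    assert STFR_MODEL_ARGS['file_name'] == 'onnx/model_quint8_avx2.onnx'
else:
    # the default torch backend needs no extra model arguments
    assert STFR_MODEL_ARGS == {}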

(changed file: lang_main.errors)
@@ -1,3 +1,9 @@
+# ** meta exceptions
+class LanguageModelNotFoundError(Exception):
+    """Error raised if a given language model could not be loaded successfully"""
+
+
+# ** token graph exceptions
 class EdgePropertyNotContainedError(Exception):
     """Error raised if a needed edge property is not contained in graph edges"""
@@ -21,8 +27,6 @@ class DependencyMissingError(Exception):
 # ** pipelines to perform given actions on dataset in a customisable manner
 class NoPerformableActionError(Exception):
     """Error describing that no action is available in the current pipeline"""

(changed file: lang_main.io)
@@ -1,7 +1,6 @@
 import base64
 import pickle
 import shutil
-import tomllib
 from pathlib import Path
 from typing import Any
@@ -33,15 +32,6 @@
 )

-def load_toml_config(
-    path_to_toml: str | Path,
-) -> dict[str, Any]:
-    with open(path_to_toml, 'rb') as f:
-        data = tomllib.load(f)
-    logger.info('Loaded TOML config file successfully.')
-    return data

 # saving and loading using pickle
 # careful: pickling from unknown sources can be dangerous
 def save_pickle(

(changed file: the packaged lang_main_config.toml)
@@ -1,4 +1,6 @@
 # lang_main: Config file
+[info]
+pkg = 'lang_main'

 [paths]
 inputs = './inputs/'

(changed file: lang_main logging setup)
@@ -5,6 +5,7 @@ from time import gmtime
 from typing import Final

 from lang_main.constants import (
+    BASE_PATH,
     ENABLE_LOGGING,
     LOGGING_TO_FILE,
     LOGGING_TO_STDERR,
@@ -15,11 +16,11 @@ from lang_main.types import LoggingLevels
 logging.Formatter.converter = gmtime
 LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
-LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
-# logging.basicConfig(
-#     format=LOG_FMT,
-#     datefmt=LOG_DATE_FMT,
-# )
+LOG_FILE_FOLDER: Final[Path] = BASE_PATH / 'logs'
+if not LOG_FILE_FOLDER.exists():
+    LOG_FILE_FOLDER.mkdir(parents=True)
+LOG_FILE_PATH: Final[Path] = LOG_FILE_FOLDER / 'lang-main.log'

 # ** formatters
 logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
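Since logging.Formatter.converter is set to gmtime, every timestamp is rendered in UTC; a small, self-contained illustration of the record layout defined above (the handler wiring here exists only for the demo):

# demo of the LOG_FMT / LOG_DATE_FMT record layout (UTC timestamps)
import logging
from time import gmtime

logging.Formatter.converter = gmtime
formatter = logging.Formatter(
    fmt='%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S +0000',
)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
demo_logger = logging.getLogger('lang_main_demo')
demo_logger.addHandler(handler)
demo_logger.warning('example message')
# output shape: '<UTC timestamp> +0000 | lang_main:<calling module>:WARNING | example message'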

(changed file: lang_main.model_loader)
@@ -1,16 +1,25 @@
 from __future__ import annotations

+import importlib
 from typing import (
-    TYPE_CHECKING,
     Any,
+    Final,
     Literal,
+    cast,
     overload,
 )

-import spacy
-from sentence_transformers import SentenceTransformer
+from sentence_transformers import SentenceTransformer, SimilarityFunction

-from lang_main.constants import STFR_SIMILARITY
+from lang_main.constants import (
+    SPACY_MODEL_NAME,
+    STFR_BACKEND,
+    STFR_DEVICE,
+    STFR_MODEL_ARGS,
+    STFR_MODEL_NAME,
+    STFR_SIMILARITY,
+)
+from lang_main.errors import LanguageModelNotFoundError
 from lang_main.types import (
     LanguageModels,
     Model,
@@ -20,9 +29,6 @@ from lang_main.types import (
     STFRDeviceTypes,
 )

-if TYPE_CHECKING:
-    from sentence_transformers import SimilarityFunction

 @overload
 def instantiate_model(
@@ -53,14 +59,27 @@ def instantiate_model(
 def load_spacy(
     model_name: str,
 ) -> SpacyModel:
-    return spacy.load(model_name)
+    try:
+        spacy_model_obj = importlib.import_module(model_name)
+    except ModuleNotFoundError:
+        raise LanguageModelNotFoundError(
+            (
+                f'Could not find spaCy model >>{model_name}<<. '
+                f'Check if it is installed correctly.'
+            )
+        )
+    pretrained_model = cast(SpacyModel, spacy_model_obj.load())
+    return pretrained_model
 def load_sentence_transformer(
     model_name: str,
-    similarity_func: SimilarityFunction = STFR_SIMILARITY,
+    similarity_func: SimilarityFunction = SimilarityFunction.COSINE,
     backend: STFRBackends = STFRBackends.TORCH,
     device: STFRDeviceTypes = STFRDeviceTypes.CPU,
+    local_files_only: bool = False,
+    model_save_folder: str | None = None,
     model_kwargs: dict[str, Any] | None = None,
 ) -> SentenceTransformer:
     return SentenceTransformer(
@@ -68,5 +87,28 @@ def load_sentence_transformer(
         similarity_fn_name=similarity_func,
         backend=backend,  # type: ignore Literal matches Enum
         device=device,
+        cache_folder=model_save_folder,
+        local_files_only=local_files_only,
         model_kwargs=model_kwargs,
     )


+# ** configured model builder functions
+MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
+    LanguageModels.SENTENCE_TRANSFORMER: {
+        'func': load_sentence_transformer,
+        'kwargs': {
+            'model_name': STFR_MODEL_NAME,
+            'similarity_func': STFR_SIMILARITY,
+            'backend': STFR_BACKEND,
+            'device': STFR_DEVICE,
+            'model_kwargs': STFR_MODEL_ARGS,
+        },
+    },
+    LanguageModels.SPACY: {
+        'func': load_spacy,
+        'kwargs': {
+            'model_name': SPACY_MODEL_NAME,
+        },
+    },
+}
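The map pairs each LanguageModels member with a loader function and its keyword arguments; a sketch of how such a map is typically consumed (the body of instantiate_model is not shown in this diff, so the dispatch below is an assumption):

# assumed dispatch logic; the real instantiate_model may differ
from lang_main.model_loader import MODEL_LOADER_MAP
from lang_main.types import LanguageModels

def instantiate_from_map(model_load_map, model: LanguageModels):
    entry = model_load_map[model]
    return entry['func'](**entry['kwargs'])

nlp = instantiate_from_map(MODEL_LOADER_MAP, LanguageModels.SPACY)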

(changed file: lang_main pipeline construction)
@@ -30,7 +30,6 @@ from lang_main.constants import (
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
-    MODEL_LOADER_MAP,
     NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
     THRESHOLD_AMOUNT_CHARACTERS,
@@ -41,6 +40,7 @@ from lang_main.constants import (
     THRESHOLD_UNIQUE_TEXTS,
     UNIQUE_CRITERION_FEATURE,
 )
+from lang_main.model_loader import MODEL_LOADER_MAP
 from lang_main.pipelines.base import Pipeline
 from lang_main.types import EntryPoints, LanguageModels

(changed file: lang_main.types)
@@ -45,13 +45,20 @@ class ONNXExecutionProvider(enum.StrEnum):
     CPU = 'CPUExecutionProvider'

-class STFRModels(enum.StrEnum):
+class STFRModelTypes(enum.StrEnum):
     ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
     ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
     ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
     ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'


+class SpacyModelTypes(enum.StrEnum):
+    DE_CORE_NEWS_SM = 'de_core_news_sm'
+    DE_CORE_NEWS_MD = 'de_core_news_md'
+    DE_CORE_NEWS_LG = 'de_core_news_lg'
+    DE_DEP_NEWS_TRF = 'de_dep_news_trf'


 class STFRQuantFilenames(enum.StrEnum):
     ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
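Because these are StrEnum members, each one behaves like its plain string value, which is what lets load_spacy hand a SpacyModelTypes member directly to importlib; a short illustration:

# StrEnum members compare equal to (and render as) their string values
from lang_main.types import STFRModelTypes, SpacyModelTypes

assert SpacyModelTypes.DE_DEP_NEWS_TRF == 'de_dep_news_trf'
assert str(STFRModelTypes.ALL_MPNET_BASE_V2) == 'all-mpnet-base-v2'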

(one file diff suppressed: too large to display)

tests/analyse_dataset.xlsx (new binary file, not shown)

(new test file: unit tests for lang_main.analysis.graphs)
@@ -0,0 +1,168 @@
import networkx as nx
import pytest

from lang_main.analysis import graphs

TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'


def build_init_graph(token_graph: bool):
    edge_weights = [
        {'weight': 1},
        {'weight': 2},
        {'weight': 3},
        {'weight': 4},
        {'weight': 5},
        {'weight': 6},
    ]
    edges = [
        (1, 2),
        (1, 3),
        (2, 4),
        (3, 4),
        (1, 4),
        (2, 1),
    ]
    edges_to_add = []
    for i, edge in enumerate(edges):
        edge = list(edge)
        edge.append(edge_weights[i])  # type: ignore
        edges_to_add.append(tuple(edge))
    if token_graph:
        G = graphs.TokenGraph(name=TK_GRAPH_NAME, enable_logging=False)
    else:
        G = nx.DiGraph()
    G.add_edges_from(edges_to_add)
    return G


@pytest.fixture(scope='module')
def graph():
    return build_init_graph(token_graph=False)


@pytest.fixture(scope='module')
def tk_graph():
    return build_init_graph(token_graph=True)


def test_graph_size(graph):
    assert len(graph.nodes) == 4
    assert len(graph.edges) == 6


def test_save_to_GraphML(graph, tmp_path):
    filename = 'test_graphML'
    graphs.save_to_GraphML(graph, saving_path=tmp_path, filename=filename)
    saved_file = (tmp_path / filename).with_suffix('.graphml')
    assert saved_file.exists()


def test_metadata_retrieval(graph):
    metadata = graphs.get_graph_metadata(graph)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 6
    assert metadata['node_memory'] == 112
    assert metadata['edge_memory'] == 336
    assert metadata['total_memory'] == 448


def test_graph_update_batch():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, batch=((4, 5), (5, 6)), weight_connection=8)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 6
    assert metadata['num_edges'] == 8
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 8


def test_graph_update_single_new():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=4, child=5, weight_connection=7)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 5
    assert metadata['num_edges'] == 7
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 7


def test_graph_update_single_existing():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=1, child=4, weight_connection=5)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 10


@pytest.mark.parametrize('cast_int', [True, False])
def test_graph_undirected_conversion(graph, cast_int):
    graph_undir = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)
    # edges: (1, 2, w=1) and (2, 1, w=6) --> undirected: (1, 2, w=7)
    assert graph_undir[1][2]['weight'] == pytest.approx(7.0)


def test_graph_cytoscape_conversion(graph):
    cyto_graph, weight_data = graphs.convert_graph_to_cytoscape(graph)
    node = cyto_graph[0]
    edge = cyto_graph[-1]
    assert node['data']['id'] == 1  # type: ignore
    assert edge['data']['source'] == 3  # type: ignore
    assert edge['data']['target'] == 4  # type: ignore
    assert edge['data']['weight'] == 4  # type: ignore
    assert weight_data['min'] == 1
    assert weight_data['max'] == 6


def test_tk_graph_properties(tk_graph):
    assert tk_graph.name == TK_GRAPH_NAME
    assert isinstance(tk_graph.directed, graphs.TokenGraph)
    assert isinstance(tk_graph.undirected, nx.Graph)
    tk_graph.update_metadata()
    metadata_directed = tk_graph.metadata_directed
    assert metadata_directed['num_nodes'] == 4
    assert metadata_directed['num_edges'] == 6
    assert metadata_directed['min_edge_weight'] == 1
    assert metadata_directed['max_edge_weight'] == 6
    assert metadata_directed['node_memory'] == 112
    assert metadata_directed['edge_memory'] == 336
    assert metadata_directed['total_memory'] == 448
    metadata_undirected = tk_graph.metadata_undirected
    assert metadata_undirected['num_nodes'] == 4
    assert metadata_undirected['num_edges'] == 5
    assert metadata_undirected['min_edge_weight'] == 2
    assert metadata_undirected['max_edge_weight'] == 7
    assert metadata_undirected['node_memory'] == 112
    assert metadata_undirected['edge_memory'] == 280
    assert metadata_undirected['total_memory'] == 392


def test_graph_degree_filter(tk_graph):
    filtered_graph = graphs.filter_graph_by_node_degree(
        tk_graph,
        bound_lower=3,
        bound_upper=3,
    )
    assert len(filtered_graph.nodes) == 2


def test_graph_edge_number_filter(tk_graph):
    number_edges_limit = 1
    filtered_graph = graphs.filter_graph_by_number_edges(
        tk_graph,
        limit=number_edges_limit,
    )
    assert len(filtered_graph.edges) == number_edges_limit
    filtered_graph = graphs.filter_graph_by_node_degree(
        filtered_graph,
        bound_lower=1,
        bound_upper=None,
    )
    assert len(filtered_graph.nodes) == 2, 'one edge should result in only two nodes'

(new test file: unit tests for lang_main.analysis.preprocessing and shared cleansing helpers)
@@ -0,0 +1,73 @@
"""testing each function in a consecutive way like each one is
executed in in a pipeline
"""
from lang_main.analysis import preprocessing as ppc
from lang_main.analysis import shared


def test_load_data(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    assert len(data) == 1000


def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    assert len(data) == 999


def test_remove_na(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    assert len(data) == 998


# def test_string_cleansing():
#     string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
#     cleaned_string = shared.clean_string_slim(string)
#     target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
#     assert cleaned_string == target_string


def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string


def test_analyse_feature(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string
    (data,) = ppc.analyse_feature(data, target_feature=target_features[0])
    assert len(data) == 139
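The first loading and cleaning steps recur in every test above; a possible shared fixture that would bundle them (a sketch only, not part of the commit):

# sketch of a shared fixture bundling the common preprocessing steps
import pytest

from lang_main.analysis import preprocessing as ppc


@pytest.fixture()
def cleaned_data(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    (data,) = ppc.remove_NA(data, ('VorgangsBeschreibung',))
    return data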

tests/conftest.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from pathlib import Path

import pytest

DATE_COLS: tuple[str, ...] = (
    'VorgangsDatum',
    'ErledigungsDatum',
    'Arbeitsbeginn',
    'ErstellungsDatum',
)


@pytest.fixture(scope='session')
def raw_data_path():
    pth_data = Path('./tests/Dummy_Dataset_N_1000.csv')
    assert pth_data.exists()
    return pth_data


@pytest.fixture(scope='session')
def raw_data_date_cols():
    return DATE_COLS
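These fixtures assume the dummy dataset sits at tests/Dummy_Dataset_N_1000.csv and that the suite is started from the repository root; one way to run it while skipping the heavyweight model-loading tests (the 'mload' marker used further below is assumed to be registered):

# run the test suite programmatically, deselecting the 'mload'-marked tests
import pytest

exit_code = pytest.main(['tests', '-m', 'not mload'])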

(deleted file: an old lang_main config)
@@ -1,56 +0,0 @@
# lang_main: Config file
[paths]
inputs = '../scripts/inputs/'
results = '../scripts/results/test_new2/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

tests/test_config.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from lang_main import config, pkg_dir


def test_load_config():
    toml_path = pkg_dir / 'lang_main_config.toml'
    loaded_cfg = config.load_toml_config(toml_path)
    assert loaded_cfg['info']['pkg'] == 'lang_main'

tests/test_io.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import pytest

from lang_main import io

CONTENT = 'test_lang_main'


@pytest.mark.parametrize(
    'overwrite',
    [True, False],
)
def test_create_saving_folder(tmp_path, overwrite):
    target_dir = tmp_path / 'test'
    assert not target_dir.exists()
    io.create_saving_folder(target_dir, overwrite_existing=overwrite)
    assert target_dir.exists()
    assert target_dir.is_dir()


def test_save_load(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    loaded = io.load_pickle(save_pth)
    assert loaded == CONTENT
    b64_str = io.encode_to_base64_str(CONTENT)
    b64_str_file = io.encode_file_to_base64_str(save_pth)
    assert b64_str == b64_str_file
    b64_decoded = io.decode_from_base64_str(b64_str)
    assert b64_decoded == CONTENT
    b64_decoded_file = io.decode_from_base64_str(b64_str_file)
    assert b64_decoded_file == CONTENT


def test_get_entry_point(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main',
        '.pkl',
        check_existence=True,
    )
    assert pth.exists()
    with pytest.raises(FileNotFoundError):
        _ = io.get_entry_point(
            tmp_path,
            'test_lang_main2',
            '.pkl',
            check_existence=True,
        )
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main2',
        '.pkl',
        check_existence=False,
    )
    assert not pth.exists()
(new test file: smoke test for BASE_PATH)
@@ -0,0 +1,5 @@
from lang_main import BASE_PATH


def test_base_path():
    assert BASE_PATH is not None

tests/test_model_loader.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import pytest
from sentence_transformers import SentenceTransformer
from spacy.language import Language

from lang_main import model_loader
from lang_main.constants import (
    STFR_MODEL_ARGS_ONNX,
    SimilarityFunction,
    SpacyModelTypes,
    STFRBackends,
    STFRDeviceTypes,
    STFRModelTypes,
)
from lang_main.types import LanguageModels


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.TORCH,
        device=STFRDeviceTypes.CPU,
        model_kwargs=None,
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer_onnx(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.ONNX,
        device=STFRDeviceTypes.CPU,
        model_kwargs=STFR_MODEL_ARGS_ONNX,  # type: ignore
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'model_name',
    [
        SpacyModelTypes.DE_CORE_NEWS_SM,
        SpacyModelTypes.DE_CORE_NEWS_MD,
        SpacyModelTypes.DE_CORE_NEWS_LG,
        SpacyModelTypes.DE_DEP_NEWS_TRF,
    ],
)
@pytest.mark.mload
def test_load_spacy_model(
    model_name,
):
    model = model_loader.load_spacy(
        model_name=model_name,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_spacy_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SPACY,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_stfr_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SENTENCE_TRANSFORMER,
    )
    assert isinstance(model, SentenceTransformer)
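The custom 'mload' marker is not registered anywhere in this commit; to avoid unknown-marker warnings it would typically be declared, for example via a pytest_configure hook in tests/conftest.py (an assumed addition, not part of the commit):

# assumed addition to tests/conftest.py registering the custom marker
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'mload: tests that download or load full language models',
    )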