started adding comprehensive unit tests

This commit is contained in:
Florian Förster
2024-11-13 17:54:47 +01:00
parent a0ca71ea87
commit 6781b4a132
32 changed files with 4042 additions and 1430 deletions

View File

@@ -1,58 +0,0 @@
# lang_main: Config file
# Layout: [paths], [control], [preprocess], [graph_postprocessing] and three
# [time_analysis.*] sub-tables (uniqueness, preparation, model_input).
# NOTE(review): what the relative paths below are resolved against is not
# visible in this file — confirm with the config loader.
[paths]
inputs = './inputs/'
# Alternative results/dataset pair pointing at the dummy dataset (N = 1000);
# left commented out so it can be swapped in by hand.
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/'
dataset = '../data/02_202307/Export4.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
# Presumably each flag set to true skips the correspondingly named pipeline
# step — TODO confirm against the consumer. As committed, preprocessing and
# time analysis are the two flags set to true.
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
[preprocess]
# Column names of the dataset; presumably the ones to be treated as dates
# (the names end in "Datum" = date, "Arbeitsbeginn" = start of work).
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
# NOTE(review): exact semantics of both thresholds (inclusive/exclusive, which
# similarity measure) are not visible here — verify in the preprocessing code.
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
# NOTE(review): whether these act as upper or lower bounds cannot be told from
# the config alone — confirm before tuning.
threshold_edge_number = 300
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
# The two values below are column names ('HObjektText', 'ObjektID').
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
# German column labels; English meaning: "time span until repair [days]" and
# "time span until the next event [days]". The strings themselves are runtime
# values and must not be translated.
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# Wider three-feature variant of input_features, kept commented out as an
# alternative to the single-feature list that is active below.
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
# German activity labels; English meaning: "repair order (portal)" and
# "fault report". Presumably values of the activity_feature column — verify.
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
# NOTE(review): the key is misspelled ("acitivities" instead of "activities").
# It is left unchanged here because the consumer reads this exact name; rename
# only together with the code that loads it.
threshold_num_acitivities = 1
# Same key name and value as [preprocess].threshold_similarity, but an
# independent setting because it lives in a different table.
threshold_similarity = 0.8

View File

@@ -19,6 +19,900 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
"metadata": {},
"outputs": [],
"source": [
"edge_weighst = [\n",
" {'weight': 1},\n",
" {'weight': 2},\n",
" {'weight': 3},\n",
" {'weight': 4},\n",
" {'weight': 5},\n",
" {'weight': 6},\n",
"]\n",
"edges = [\n",
" (1, 2),\n",
" (1, 3),\n",
" (2, 4),\n",
" (3, 4),\n",
" (1, 4),\n",
" (2, 1),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
"metadata": {},
"outputs": [],
"source": [
"edges_to_add = []\n",
"for i, edge in enumerate(edges):\n",
" edge = list(edge)\n",
" edge.append(edge_weighst[i])\n",
" edges_to_add.append(tuple(edge))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 2, {'weight': 1}),\n",
" (1, 3, {'weight': 2}),\n",
" (2, 4, {'weight': 3}),\n",
" (3, 4, {'weight': 4}),\n",
" (1, 4, {'weight': 5}),\n",
" (2, 1, {'weight': 6})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"edges_to_add"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "91d4094b-f886-4056-a697-5223f157f1d3",
"metadata": {},
"outputs": [],
"source": [
"tk = graphs.TokenGraph()\n",
"tk.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "518cada9-561a-4b96-b750-3d500d1d28b9",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.analysis import graphs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 4,\n",
" 'num_edges': 6,\n",
" 'min_edge_weight': 1,\n",
" 'max_edge_weight': 6,\n",
" 'node_memory': 112,\n",
" 'edge_memory': 336,\n",
" 'total_memory': 448}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.get_graph_metadata(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca2ce8e8-d72a-4edf-ae42-0f79bd9d19a2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 38,
"id": "223dc592-fa56-4536-a5c2-a166001a6aca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 1.0 2.0 5.0\n",
"2 6.0 0.0 0.0 3.0\n",
"3 0.0 0.0 0.0 4.0\n",
"4 0.0 0.0 0.0 0.0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "1f677da0-7416-413c-adb1-ae1384e09349",
"metadata": {},
"outputs": [],
"source": [
"G_undir = graphs.convert_graph_to_undirected(G, cast_int=False)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "356862fb-2383-43d9-80ba-4fe83646c9d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5.0</td>\n",
" <td>3.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 7.0 2.0 5.0\n",
"2 7.0 0.0 0.0 3.0\n",
"3 2.0 0.0 0.0 4.0\n",
"4 5.0 3.0 4.0 0.0"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G_undir)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "b8a3db1a-0d2a-4635-ab88-7802e2cf59e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir.is_directed()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "46001528-75b0-4fe8-a3ec-353bbd3eeeff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 7.0}"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "cf2dcdff-f0b7-416e-9db3-c7a21ea96b96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([{'data': {'id': 1, 'label': 1}},\n",
" {'data': {'id': 2, 'label': 2}},\n",
" {'data': {'id': 3, 'label': 3}},\n",
" {'data': {'id': 4, 'label': 4}},\n",
" {'data': {'source': 1, 'target': 2, 'weight': 1}},\n",
" {'data': {'source': 1, 'target': 3, 'weight': 2}},\n",
" {'data': {'source': 1, 'target': 4, 'weight': 5}},\n",
" {'data': {'source': 2, 'target': 4, 'weight': 3}},\n",
" {'data': {'source': 2, 'target': 1, 'weight': 6}},\n",
" {'data': {'source': 3, 'target': 4, 'weight': 4}}],\n",
" {'min': 1, 'max': 6})"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.convert_graph_to_cytoscape(G)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f82481e9-873f-4657-80d3-ba75af74fa27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 4, number of edges: 6)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk.update_metadata()\n",
"tk"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4b806620-b469-45ef-823b-db46f8590509",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(G.degree)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a41d019-1b6b-46f7-b13e-ac22da737940",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G.degree[1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e99f2fb4-4c8d-4564-810d-a4b2ed9d6009",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(tk.degree)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1368ebf6-e008-492d-8d15-fe3ed12b78a3",
"metadata": {},
"outputs": [],
"source": [
"g_filt = graphs.filter_graph_by_node_degree(\n",
" tk,\n",
" bound_lower=3,\n",
" bound_upper=3,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "de69f73e-da1d-4479-81da-006f2ce61844",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 2, number of edges: 1)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_filt"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b12fd64d-737e-4c68-94ea-72a817647a04",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 4]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(g_filt.nodes)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "21434c7c-887c-4f9f-884a-48514e2279e0",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2defef69-f09a-4869-984a-27b6373b17b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 1}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0308a2ac-f554-4e24-9ddb-578dd588f3c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(G.edges)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d9ce65-f9a5-40de-a737-098579f6a8ee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7acf4be7-45f3-45e6-87f5-14343f23d610",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9139812b-74ba-45ce-adfc-e57667259692",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main import search_iterative, search_base_path\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "35118922-3a17-4698-93bc-5292a276a4b4",
"metadata": {},
"outputs": [],
"source": [
"from lang_main import constants"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "341299bf-e926-4e55-8545-8805a186f49c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-models')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"constants.MODEL_BASE_FOLDER"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "11ce4062-b229-4d88-967d-6eeb6d0135b7",
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40dac543-1e53-4fd8-a192-88f3527872b2",
"metadata": {},
"outputs": [],
"source": [
"model_kwargs = {\n",
" 'file_name': 'onnx/model_quint8_avx2.onnx',\n",
" 'provider': 'CPUExecutionProvider',\n",
" 'export': False,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8a0eaa9f-e2d2-4106-b80b-80916e9d8bfe",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "215b46e3607e4530b2d8f8227367ef23",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model_quint8_avx2.onnx: 0%| | 0.00/23.0M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in A:\\Arbeitsaufgaben\\lang-models\\models--sentence-transformers--all-MiniLM-L6-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n"
]
}
],
"source": [
"stfr = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine', backend='onnx', model_kwargs=model_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cd921aca-0673-41ec-98a3-18e360a39a41",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.constants import SPACY_MODEL_NAME"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "88173a68-7d8e-4f4c-a4ad-bbf78efaf781",
"metadata": {},
"outputs": [],
"source": [
"import importlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5293976-22ab-406a-ba32-066fd7254394",
"metadata": {},
"outputs": [],
"source": [
"mod = importlib.import_module(SPACY_MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6023a339-02da-429c-acf5-f14a56989357",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'de_dep_news_trf' from 'A:\\\\Arbeitsaufgaben\\\\lang-main\\\\.venv\\\\Lib\\\\site-packages\\\\de_dep_news_trf\\\\__init__.py'>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mod"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5f4fa066-fa0f-4818-9cf9-ec28923150ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main.analysis.shared import clean_string_slim"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "71286836-7eb2-4095-ab82-42d7ac7ed476",
"metadata": {},
"outputs": [],
"source": [
"string = 'Ölleckage durch\\nundichten \\t Ölsumpf,, aber Dichtung intakt??!!!'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e8284e76-e750-458e-bb63-d59d6d57a396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ölleckage durch\n",
"undichten \t Ölsumpf,, aber Dichtung intakt??!!!\n"
]
}
],
"source": [
"print(string)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "82e98d8f-2e24-42f9-a3ed-3b3454ae64f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_string_slim(string)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b527b145-15d2-4961-b441-1843fe9f5c29",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "49c2e2f0-1e6d-4969-b583-8fc15b8930f9",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "16ae5d5c-a0a7-400b-8e38-231c72ad27b5",
"metadata": {},
"outputs": [],
"source": [
"pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3b9fe636-f895-404a-819d-61198d34262d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Am war ich essen. Am hingegen nicht. Und war ich allein.'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'\n",
"string = pattern_dates.sub('', string)\n",
"string"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c49ab3c-e860-42af-ac0c-2f44f075e846",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,