started adding comprehensive unit tests

parent a0ca71ea87
commit 6781b4a132

@ -2,49 +2,56 @@
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'

[logging]
enabled = true
stderr = true
file = true

# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150
threshold_edge_number = 330
# threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'

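The [control] flags above gate individual pipeline stages. As a rough illustration only (the project itself resolves and loads this file through its own load_toml_config helper), the config can be read with the standard-library tomllib available on the Python >= 3.11 required by pyproject.toml; run_preprocessing is a hypothetical stage entry point:

import tomllib

with open('lang_main_config.toml', 'rb') as f:
    cfg = tomllib.load(f)

if not cfg['control']['preprocessing_skip']:
    run_preprocessing(cfg['paths']['dataset'])  # hypothetical stage entry point
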
@ -1,58 +0,0 @@

# lang_main: Config file

[paths]
inputs = './inputs/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/'
dataset = '../data/02_202307/Export4.csv'

# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true

[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_number = 300
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

@ -19,6 +19,900 @@
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import networkx as nx"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"edge_weighst = [\n",
|
||||
" {'weight': 1},\n",
|
||||
" {'weight': 2},\n",
|
||||
" {'weight': 3},\n",
|
||||
" {'weight': 4},\n",
|
||||
" {'weight': 5},\n",
|
||||
" {'weight': 6},\n",
|
||||
"]\n",
|
||||
"edges = [\n",
|
||||
" (1, 2),\n",
|
||||
" (1, 3),\n",
|
||||
" (2, 4),\n",
|
||||
" (3, 4),\n",
|
||||
" (1, 4),\n",
|
||||
" (2, 1),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"edges_to_add = []\n",
|
||||
"for i, edge in enumerate(edges):\n",
|
||||
" edge = list(edge)\n",
|
||||
" edge.append(edge_weighst[i])\n",
|
||||
" edges_to_add.append(tuple(edge))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 2, {'weight': 1}),\n",
|
||||
" (1, 3, {'weight': 2}),\n",
|
||||
" (2, 4, {'weight': 3}),\n",
|
||||
" (3, 4, {'weight': 4}),\n",
|
||||
" (1, 4, {'weight': 5}),\n",
|
||||
" (2, 1, {'weight': 6})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"edges_to_add"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G = nx.DiGraph()\n",
|
||||
"G.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "91d4094b-f886-4056-a697-5223f157f1d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tk = graphs.TokenGraph()\n",
|
||||
"tk.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "518cada9-561a-4b96-b750-3d500d1d28b9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main.analysis import graphs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'num_nodes': 4,\n",
|
||||
" 'num_edges': 6,\n",
|
||||
" 'min_edge_weight': 1,\n",
|
||||
" 'max_edge_weight': 6,\n",
|
||||
" 'node_memory': 112,\n",
|
||||
" 'edge_memory': 336,\n",
|
||||
" 'total_memory': 448}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"graphs.get_graph_metadata(G)"
|
||||
]
|
||||
},
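The cell above shows that graphs.get_graph_metadata reports node and edge counts, the minimum and maximum edge weight, and rough memory figures for the example graph. A minimal sketch of how the count and weight fields could be derived with plain NetworkX (the memory fields are omitted; how the library computes them is not shown here):

import networkx as nx

def graph_metadata_sketch(graph: nx.DiGraph) -> dict[str, float]:
    # hypothetical re-implementation of the count/weight fields shown above
    weights = [attrs.get('weight', 1) for _, _, attrs in graph.edges(data=True)]
    return {
        'num_nodes': graph.number_of_nodes(),
        'num_edges': graph.number_of_edges(),
        'min_edge_weight': min(weights) if weights else 0,
        'max_edge_weight': max(weights) if weights else 0,
    }
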
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ca2ce8e8-d72a-4edf-ae42-0f79bd9d19a2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "223dc592-fa56-4536-a5c2-a166001a6aca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 1 2 3 4\n",
|
||||
"1 0.0 1.0 2.0 5.0\n",
|
||||
"2 6.0 0.0 0.0 3.0\n",
|
||||
"3 0.0 0.0 0.0 4.0\n",
|
||||
"4 0.0 0.0 0.0 0.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nx.to_pandas_adjacency(G)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "1f677da0-7416-413c-adb1-ae1384e09349",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G_undir = graphs.convert_graph_to_undirected(G, cast_int=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "356862fb-2383-43d9-80ba-4fe83646c9d9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 1 2 3 4\n",
|
||||
"1 0.0 7.0 2.0 5.0\n",
|
||||
"2 7.0 0.0 0.0 3.0\n",
|
||||
"3 2.0 0.0 0.0 4.0\n",
|
||||
"4 5.0 3.0 4.0 0.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nx.to_pandas_adjacency(G_undir)"
|
||||
]
|
||||
},
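Comparing the two adjacency matrices, graphs.convert_graph_to_undirected evidently sums the weights of reciprocal directed edges: 1->2 with weight 1 and 2->1 with weight 6 become a single undirected edge of weight 7. A hedged sketch of that behaviour, independent of the library's actual implementation:

import networkx as nx

def to_undirected_summed(g: nx.DiGraph) -> nx.Graph:
    # sum weights of reciprocal edges instead of keeping only one of them
    und = nx.Graph()
    for u, v, attrs in g.edges(data=True):
        w = attrs.get('weight', 1)
        if und.has_edge(u, v):
            und[u][v]['weight'] += w
        else:
            und.add_edge(u, v, weight=w)
    return und
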
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "b8a3db1a-0d2a-4635-ab88-7802e2cf59e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G_undir.is_directed()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "46001528-75b0-4fe8-a3ec-353bbd3eeeff",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'weight': 7.0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G_undir[1][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "cf2dcdff-f0b7-416e-9db3-c7a21ea96b96",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"([{'data': {'id': 1, 'label': 1}},\n",
|
||||
" {'data': {'id': 2, 'label': 2}},\n",
|
||||
" {'data': {'id': 3, 'label': 3}},\n",
|
||||
" {'data': {'id': 4, 'label': 4}},\n",
|
||||
" {'data': {'source': 1, 'target': 2, 'weight': 1}},\n",
|
||||
" {'data': {'source': 1, 'target': 3, 'weight': 2}},\n",
|
||||
" {'data': {'source': 1, 'target': 4, 'weight': 5}},\n",
|
||||
" {'data': {'source': 2, 'target': 4, 'weight': 3}},\n",
|
||||
" {'data': {'source': 2, 'target': 1, 'weight': 6}},\n",
|
||||
" {'data': {'source': 3, 'target': 4, 'weight': 4}}],\n",
|
||||
" {'min': 1, 'max': 6})"
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"graphs.convert_graph_to_cytoscape(G)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f82481e9-873f-4657-80d3-ba75af74fa27",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"TokenGraph(name: TokenGraph, number of nodes: 4, number of edges: 6)"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tk.update_metadata()\n",
|
||||
"tk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"id": "4b806620-b469-45ef-823b-db46f8590509",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(G.degree)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"id": "2a41d019-1b6b-46f7-b13e-ac22da737940",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"4"
|
||||
]
|
||||
},
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G.degree[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e99f2fb4-4c8d-4564-810d-a4b2ed9d6009",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(tk.degree)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "1368ebf6-e008-492d-8d15-fe3ed12b78a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"g_filt = graphs.filter_graph_by_node_degree(\n",
|
||||
" tk,\n",
|
||||
" bound_lower=3,\n",
|
||||
" bound_upper=3,\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "de69f73e-da1d-4479-81da-006f2ce61844",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"TokenGraph(name: TokenGraph, number of nodes: 2, number of edges: 1)"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"g_filt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "b12fd64d-737e-4c68-94ea-72a817647a04",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[2, 4]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(g_filt.nodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "21434c7c-887c-4f9f-884a-48514e2279e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G = nx.DiGraph()\n",
|
||||
"G.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "2defef69-f09a-4869-984a-27b6373b17b9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'weight': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G[1][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "0308a2ac-f554-4e24-9ddb-578dd588f3c8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"5"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(G.edges)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15d9ce65-f9a5-40de-a737-098579f6a8ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7acf4be7-45f3-45e6-87f5-14343f23d610",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "9139812b-74ba-45ce-adfc-e57667259692",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded TOML config file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lang_main import search_iterative, search_base_path\n",
|
||||
"from pathlib import Path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "35118922-3a17-4698-93bc-5292a276a4b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main import constants"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "341299bf-e926-4e55-8545-8805a186f49c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"WindowsPath('A:/Arbeitsaufgaben/lang-models')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"constants.MODEL_BASE_FOLDER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "11ce4062-b229-4d88-967d-6eeb6d0135b7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sentence_transformers import SentenceTransformer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "40dac543-1e53-4fd8-a192-88f3527872b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_kwargs = {\n",
|
||||
" 'file_name': 'onnx/model_quint8_avx2.onnx',\n",
|
||||
" 'provider': 'CPUExecutionProvider',\n",
|
||||
" 'export': False,\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "8a0eaa9f-e2d2-4106-b80b-80916e9d8bfe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "215b46e3607e4530b2d8f8227367ef23",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"model_quint8_avx2.onnx: 0%| | 0.00/23.0M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in A:\\Arbeitsaufgaben\\lang-models\\models--sentence-transformers--all-MiniLM-L6-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
|
||||
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
|
||||
" warnings.warn(message)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stfr = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine', backend='onnx', model_kwargs=model_kwargs)"
|
||||
]
|
||||
},
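With the model loaded above (quantized ONNX backend, cosine similarity), pairwise scores can be obtained roughly as follows; the example texts are made up for illustration:

from sentence_transformers import util

emb = stfr.encode(['Ölleckage am Ölsumpf', 'Leckage an der Dichtung'], show_progress_bar=False)
score = util.cos_sim(emb[0], emb[1]).item()  # value in [-1, 1], higher = more similar
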
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "cd921aca-0673-41ec-98a3-18e360a39a41",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main.constants import SPACY_MODEL_NAME"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "88173a68-7d8e-4f4c-a4ad-bbf78efaf781",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import importlib"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "e5293976-22ab-406a-ba32-066fd7254394",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mod = importlib.import_module(SPACY_MODEL_NAME)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6023a339-02da-429c-acf5-f14a56989357",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<module 'de_dep_news_trf' from 'A:\\\\Arbeitsaufgaben\\\\lang-main\\\\.venv\\\\Lib\\\\site-packages\\\\de_dep_news_trf\\\\__init__.py'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"mod"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "5f4fa066-fa0f-4818-9cf9-ec28923150ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded TOML config file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lang_main.analysis.shared import clean_string_slim"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "71286836-7eb2-4095-ab82-42d7ac7ed476",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"string = 'Ölleckage durch\\nundichten \\t Ölsumpf,, aber Dichtung intakt??!!!'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e8284e76-e750-458e-bb63-d59d6d57a396",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ölleckage durch\n",
|
||||
"undichten \t Ölsumpf,, aber Dichtung intakt??!!!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(string)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "82e98d8f-2e24-42f9-a3ed-3b3454ae64f4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clean_string_slim(string)"
|
||||
]
|
||||
},
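Judging from the cell above, clean_string_slim collapses runs of whitespace (including newlines and tabs) and squeezes repeated punctuation down to a single character. A rough approximation, not the library's actual implementation:

import re

def clean_string_slim_sketch(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)                              # collapse whitespace runs
    text = re.sub(r'[,.!?]{2,}', lambda m: m.group(0)[-1], text)  # ',,' -> ',' and '??!!!' -> '!'
    return text.strip()
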
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b527b145-15d2-4961-b441-1843fe9f5c29",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "49c2e2f0-1e6d-4969-b583-8fc15b8930f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "16ae5d5c-a0a7-400b-8e38-231c72ad27b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "3b9fe636-f895-404a-819d-61198d34262d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Am war ich essen. Am hingegen nicht. Und war ich allein.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"string = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'\n",
|
||||
"string = pattern_dates.sub('', string)\n",
|
||||
"string"
|
||||
]
|
||||
},
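Note that pattern_dates makes every group optional, so it also produces zero-length matches at every position; with an empty replacement that is harmless but wasteful. A hedged, slightly stricter alternative (not the pattern used by the pipeline) that still strips the three date forms from the sample sentence:

import re

pattern_dates_strict = re.compile(r'\b(?:\d{1,2}\.){0,2}\d{2,4}\b\.?')
pattern_dates_strict.sub('', 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.')
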
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c49ab3c-e860-42af-ac0c-2f44f075e846",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
@ -15,6 +15,7 @@ dependencies = [
"typing-extensions>=4.12.2",
"tqdm>=4.67.0",
"python-dateutil>=2.9.0.post0",
"onnx==1.16.1",
]
requires-python = ">=3.11"
readme = "README.md"
@ -33,6 +34,18 @@ plot = [
cytoscape = [
"py4cytoscape>=1.11.0",
]
spacy-trf = [
"de-dep-news-trf @ https://github.com/explosion/spacy-models/releases/download/de_dep_news_trf-3.8.0/de_dep_news_trf-3.8.0-py3-none-any.whl",
]
spacy-sm = [
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl",
]
spacy-md = [
"de-core-news-md @ https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.8.0/de_core_news_md-3.8.0-py3-none-any.whl",
]
spacy-lg = [
"de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0-py3-none-any.whl",
]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
@ -57,6 +70,8 @@ dev = [
"cython>=3.0.10",
"openpyxl>=3.1.5",
"seaborn>=0.13.2",
"pytest>=8.3.3",
"pytest-cov>=6.0.0",
]

[tool.ruff]
@ -73,4 +88,37 @@ skip-magic-trailing-comma = false
select = ["E", "F", "I"]

[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]

[tool.pytest.ini_options]
addopts = [
"-vvl",
"--import-mode=importlib",
]
testpaths = [
"tests",
]
filterwarnings = [
'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
"mload: marks tests with loading of language models (deselect with '-m \"not mload\"')",
]
log_cli = true

[tool.coverage.run]
relative_files = true
source = [
"lang_main",
"tests/",
]

[tool.coverage.report]
exclude_also = [
"def __repr__",
"def __str__",
"@overload",
]

[tool.coverage.html]
directory = "reports/coverage"

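Given the commit message and the pytest/coverage settings added above, a unit test for the graph utilities exercised in the notebook might look roughly like this; the module and test names are made up, and the expected result mirrors the notebook output:

# tests/test_graphs.py -- hypothetical test module, mirroring the notebook cells above
import pytest

from lang_main.analysis import graphs


@pytest.fixture
def token_graph() -> graphs.TokenGraph:
    tk = graphs.TokenGraph()
    tk.add_edges_from(
        [
            (1, 2, {'weight': 1}),
            (1, 3, {'weight': 2}),
            (2, 4, {'weight': 3}),
            (3, 4, {'weight': 4}),
            (1, 4, {'weight': 5}),
            (2, 1, {'weight': 6}),
        ]
    )
    return tk


def test_filter_by_node_degree_keeps_inclusive_bounds(token_graph):
    # nodes 2 and 4 have degree 3 in the example graph from the notebook
    filtered = graphs.filter_graph_by_node_degree(token_graph, bound_lower=3, bound_upper=3)
    assert sorted(filtered.nodes) == [2, 4]
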
python/README.txt  Normal file

@ -0,0 +1 @@
only used to simulate directory tree in final solution

@ -1,51 +0,0 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
import warnings

from lang_main.io import load_toml_config

__all__ = [
    'CALLER_PATH',
]

logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
    stream=sys.stdout,
    format=LOG_FMT,
    datefmt=LOG_DATE_FMT,
)

CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = True
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
    if not caller_file.exists():
        warnings.warn('Caller file could not be correctly retrieved.')
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the program.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

@ -1,14 +1,19 @@
import logging
import os
from pathlib import Path
from typing import Any, Final

from lang_main.config import load_toml_config

_has_py4cyto: bool = True
try:
    import py4cytoscape as p4c
except ImportError:
    _has_py4cyto = False

from lang_main.io import load_toml_config
# ** external packages config
# ** Huggingface Hub caching
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'

# ** py4cytoscape config
if _has_py4cyto:
@ -20,6 +25,7 @@ if _has_py4cyto:
    p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())

# ** lang-main config
BASE_FOLDERNAME: Final[str] = 'lang-main'
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
PREFER_INTERNAL_CONFIG: Final[bool] = False
@ -75,27 +81,71 @@ def search_iterative(
        pattern to look for, first match will be returned,
        by default CONFIG_FILENAME
    stop_folder_name : str, optional
        name of the last folder in the directory tree to search, by default 'python'
        name of the last folder in the directory tree to search, by default None

    Returns
    -------
    Path | None
        Path if corresponding object was found, None otherwise
    """
    cfg_path: Path | None = None
    file_path: Path | None = None
    stop_folder_reached: bool = False
    for it in range(len(starting_path.parents)):
        search_path = starting_path.parents[it]  # do not look in library folder
        res = tuple(search_path.glob(glob_pattern))
        if res:
            cfg_path = res[0]
            file_path = res[0]
            break
        elif stop_folder_reached:
            break

        if stop_folder_name is not None and search_path.name == stop_folder_name:
            # library is placed inside a whole python installation for deployment
            # if this folder is reached, only look up one parent above
            stop_folder_reached = True

    return file_path

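A small usage sketch for search_iterative as changed above; the call mirrors the signature and the notebook import, while the concrete paths and pattern values are illustrative:

from pathlib import Path

from lang_main import search_iterative

cfg_file = search_iterative(
    starting_path=Path.cwd(),
    glob_pattern='lang_main_config.toml',
    stop_folder_name='python',
)
if cfg_file is None:
    print('no matching file found in any parent directory')
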


def search_base_path(
    starting_path: Path,
    stop_folder_name: str | None = None,
) -> Path | None:
    """Iteratively searches the parent directories of the starting path
    and looks for folders matching the given name. If a match is encountered,
    the parent path will be returned.

    Example:
        starting_path = path/to/start/folder
        stop_folder_name = 'to'
        returned path = 'path/'

    Parameters
    ----------
    starting_path : Path
        non-inclusive starting path
    stop_folder_name : str, optional
        name of the last folder in the directory tree to search, by default None

    Returns
    -------
    Path | None
        Path if corresponding base path was found, None otherwise
    """
    stop_folder_path: Path | None = None
    base_path: Path | None = None
    for it in range(len(starting_path.parents)):
        search_path = starting_path.parents[it]  # do not look in library folder
        if stop_folder_name is not None and search_path.name == stop_folder_name:
            # library is placed inside a whole python installation for deployment
            # only look up to this folder
            stop_folder_path = search_path
            break

    return cfg_path
    if stop_folder_path is not None:
        base_path = stop_folder_path.parent

    return base_path

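And the counterpart for search_base_path: with the repository layout implied by the module-level call further below (stop folder 'lang-main'), the function returns the parent of that folder. The example paths are invented:

from pathlib import Path

from lang_main import search_base_path

# e.g. /opt/work/lang-main/src/lang_main -> /opt/work
base = search_base_path(Path('/opt/work/lang-main/src/lang_main'), stop_folder_name='lang-main')
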


def load_cfg() -> dict[str, Any]:
@ -121,6 +171,10 @@ def load_cfg() -> dict[str, Any]:


CONFIG: Final[dict[str, Any]] = load_cfg()
base_parent_path = search_base_path(pkg_dir, stop_folder_name=BASE_FOLDERNAME)
if base_parent_path is None:
    raise FileNotFoundError('Could not resolve base path of library')
BASE_PATH: Final[Path] = base_parent_path


# ** Cytoscape configuration

@ -48,9 +48,9 @@ def save_to_GraphML(
def get_graph_metadata(
    graph: Graph | DiGraph,
    logging: bool = LOGGING_DEFAULT_GRAPHS,
) -> dict[str, int]:
) -> dict[str, float]:
    # info about graph
    graph_info: dict[str, int] = {}
    graph_info: dict[str, float] = {}
    # nodes and edges
    num_nodes = len(graph.nodes)
    num_edges = len(graph.edges)
@ -96,15 +96,6 @@ def update_graph(
    child: Hashable | None = None,
    weight_connection: int | None = None,
) -> None:
    # !! not necessary to check for existence of nodes
    # !! feature already implemented in NetworkX ``add_edge``
    """
    # check if nodes already in Graph
    if parent not in graph:
        graph.add_node(parent)
    if child not in graph:
        graph.add_node(child)
    """
    if weight_connection is None:
        weight_connection = 1
    # check if edge not in Graph
@ -115,9 +106,7 @@
        graph.add_edge(parent, child, weight=weight_connection)
    else:
        # update edge
        weight = graph[parent][child]['weight']
        weight += weight_connection
        graph[parent][child]['weight'] = weight
        graph[parent][child]['weight'] += weight_connection

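The simplification above keeps update_graph's add-or-increment semantics: a missing edge is created with the given weight, an existing edge has its weight increased, and a weight_connection of None defaults to 1. A short sketch of that contract (update_graph is the function from this module; the node labels are invented):

import networkx as nx

g = nx.DiGraph()
update_graph(graph=g, parent='pump', child='leak', weight_connection=2)  # creates the edge, weight 2
update_graph(graph=g, parent='pump', child='leak')                       # None -> defaults to 1
assert g['pump']['leak']['weight'] == 3
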


# build undirected adjacency matrix
@ -249,7 +238,8 @@ def filter_graph_by_node_degree(
    bound_lower: int | None,
    bound_upper: int | None,
) -> TokenGraph:
    """filters all nodes which are within the provided bounds by their degree
    """filters all nodes which are within the provided bounds by their degree,
    inclusive limits: nodes with bound_lower <= node_degree <= bound_upper are retained

    Parameters
    ----------
@ -266,13 +256,14 @@ def filter_graph_by_node_degree(
    # filter nodes by degree
    original_graph_nodes = copy.deepcopy(graph.nodes)
    filtered_graph = graph.copy()
    filtered_graph_degree = copy.deepcopy(filtered_graph.degree)

    if not any([bound_lower, bound_upper]):
        logger.warning('No bounds provided, returning original graph.')
        return filtered_graph

    for node in original_graph_nodes:
        degree = filtered_graph.degree[node]  # type: ignore
        degree = cast(int, filtered_graph_degree[node])  # type: ignore
        if bound_lower is not None and degree < bound_lower:
            filtered_graph.remove_node(node)
        if bound_upper is not None and degree > bound_upper:
@ -540,9 +531,9 @@ class TokenGraph(DiGraph):
        self._name = name
        # directed and undirected graph data
        self._directed = self
        self._metadata_directed: dict[str, int] = {}
        self._metadata_directed: dict[str, float] = {}
        self._undirected: Graph | None = None
        self._metadata_undirected: dict[str, int] = {}
        self._metadata_undirected: dict[str, float] = {}
        # indicate rescaled weights
        self.rescaled_weights: bool = False

@ -568,12 +559,12 @@ class TokenGraph(DiGraph):
        return hash(self.__key())
    """

    def copy(self) -> Self:
    def copy(self) -> TokenGraph:
        """returns a (deep) copy of the graph

        Returns
        -------
        Self
        TokenGraph
            deep copy of the graph
        """
        return copy.deepcopy(self)
@ -594,11 +585,11 @@ class TokenGraph(DiGraph):
        return self._undirected

    @property
    def metadata_directed(self) -> dict[str, int]:
    def metadata_directed(self) -> dict[str, float]:
        return self._metadata_directed

    @property
    def metadata_undirected(self) -> dict[str, int]:
    def metadata_undirected(self) -> dict[str, float]:
        return self._metadata_undirected

    @overload

@ -30,7 +30,7 @@ if TYPE_CHECKING:


# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
# following functions are used to load a given dataset and perform simple
# duplicate cleansing based on all properties
def load_raw_data(
    path: Path,

@ -277,41 +277,41 @@ def merge_similarity_dupl(
|
||||
|
||||
# ** #################################################################################
|
||||
# TODO check removal
|
||||
def build_embedding_map(
|
||||
data: Series,
|
||||
model: GermanSpacyModel | SentenceTransformer,
|
||||
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
|
||||
# dictionary with embeddings
|
||||
embeddings: dict[int, tuple[Embedding, str]] = {}
|
||||
is_spacy = False
|
||||
is_STRF = False
|
||||
# def build_embedding_map(
|
||||
# data: Series,
|
||||
# model: GermanSpacyModel | SentenceTransformer,
|
||||
# ) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
|
||||
# # dictionary with embeddings
|
||||
# embeddings: dict[int, tuple[Embedding, str]] = {}
|
||||
# is_spacy = False
|
||||
# is_STRF = False
|
||||
|
||||
if isinstance(model, GermanSpacyModel):
|
||||
is_spacy = True
|
||||
elif isinstance(model, SentenceTransformer):
|
||||
is_STRF = True
|
||||
# if isinstance(model, GermanSpacyModel):
|
||||
# is_spacy = True
|
||||
# elif isinstance(model, SentenceTransformer):
|
||||
# is_STRF = True
|
||||
|
||||
if not any((is_spacy, is_STRF)):
|
||||
raise NotImplementedError('Model type unknown')
|
||||
# if not any((is_spacy, is_STRF)):
|
||||
# raise NotImplementedError('Model type unknown')
|
||||
|
||||
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
|
||||
# verbose code: Pyright not inferring types correctly
|
||||
idx = cast(int, idx)
|
||||
text = cast(str, text)
|
||||
if is_spacy:
|
||||
model = cast(GermanSpacyModel, model)
|
||||
embd = cast(SpacyDoc, model(text))
|
||||
embeddings[idx] = (embd, text)
|
||||
# check for empty vectors
|
||||
if not embd.vector_norm:
|
||||
logger.debug('--- Unknown Words ---')
|
||||
logger.debug('embd.text: %s has no vector', embd.text)
|
||||
elif is_STRF:
|
||||
model = cast(SentenceTransformer, model)
|
||||
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
|
||||
embeddings[idx] = (embd, text)
|
||||
# for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
|
||||
# # verbose code: Pyright not inferring types correctly
|
||||
# idx = cast(int, idx)
|
||||
# text = cast(str, text)
|
||||
# if is_spacy:
|
||||
# model = cast(GermanSpacyModel, model)
|
||||
# embd = cast(SpacyDoc, model(text))
|
||||
# embeddings[idx] = (embd, text)
|
||||
# # check for empty vectors
|
||||
# if not embd.vector_norm:
|
||||
# logger.debug('--- Unknown Words ---')
|
||||
# logger.debug('embd.text: %s has no vector', embd.text)
|
||||
# elif is_STRF:
|
||||
# model = cast(SentenceTransformer, model)
|
||||
# embd = cast(Tensor, model.encode(text, show_progress_bar=False))
|
||||
# embeddings[idx] = (embd, text)
|
||||
|
||||
return embeddings, (is_spacy, is_STRF)
|
||||
# return embeddings, (is_spacy, is_STRF)
|
||||
|
||||
|
||||
# adapt interface
|
||||
@ -320,276 +320,275 @@ def build_embedding_map(
|
||||
|
||||
|
||||
# build similarity matrix out of embeddings
|
||||
def build_cosSim_matrix(
|
||||
data: Series,
|
||||
model: GermanSpacyModel | SentenceTransformer,
|
||||
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
|
||||
# build empty matrix
|
||||
df_index = data.index
|
||||
cosineSim_idx_matrix = pd.DataFrame(
|
||||
data=0.0, columns=df_index, index=df_index, dtype=np.float32
|
||||
)
|
||||
# def build_cosSim_matrix(
|
||||
# data: Series,
|
||||
# model: GermanSpacyModel | SentenceTransformer,
|
||||
# ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
|
||||
# # build empty matrix
|
||||
# df_index = data.index
|
||||
# cosineSim_idx_matrix = pd.DataFrame(
|
||||
# data=0.0, columns=df_index, index=df_index, dtype=np.float32
|
||||
# )
|
||||
|
||||
logger.info('Start building embedding map...')
|
||||
# logger.info('Start building embedding map...')
|
||||
|
||||
# obtain embeddings based on used model
|
||||
embds, (is_spacy, is_STRF) = build_embedding_map(
|
||||
data=data,
|
||||
model=model,
|
||||
)
|
||||
# # obtain embeddings based on used model
|
||||
# embds, (is_spacy, is_STRF) = build_embedding_map(
|
||||
# data=data,
|
||||
# model=model,
|
||||
# )
|
||||
|
||||
logger.info('Embedding map built successfully.')
|
||||
# logger.info('Embedding map built successfully.')
|
||||
|
||||
# apply index based mapping for efficient handling of large texts
|
||||
combs = combinations(df_index, 2)
|
||||
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
|
||||
# # apply index based mapping for efficient handling of large texts
|
||||
# combs = combinations(df_index, 2)
|
||||
# total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
|
||||
|
||||
logger.info('Start calculation of similarity scores...')
|
||||
# logger.info('Start calculation of similarity scores...')
|
||||
|
||||
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
|
||||
# print(f"{idx1=}, {idx2=}")
|
||||
embd1 = embds[idx1][0]
|
||||
embd2 = embds[idx2][0]
|
||||
# for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
|
||||
# # print(f"{idx1=}, {idx2=}")
|
||||
# embd1 = embds[idx1][0]
|
||||
# embd2 = embds[idx2][0]
|
||||
|
||||
# calculate similarity based on model type
|
||||
if is_spacy:
|
||||
embd1 = cast(SpacyDoc, embds[idx1][0])
|
||||
embd2 = cast(SpacyDoc, embds[idx2][0])
|
||||
cosSim = embd1.similarity(embd2)
|
||||
elif is_STRF:
|
||||
embd1 = cast(Tensor, embds[idx1][0])
|
||||
embd2 = cast(Tensor, embds[idx2][0])
|
||||
cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
|
||||
cosSim = cast(float, cosSim.item())
|
||||
# # calculate similarity based on model type
|
||||
# if is_spacy:
|
||||
# embd1 = cast(SpacyDoc, embds[idx1][0])
|
||||
# embd2 = cast(SpacyDoc, embds[idx2][0])
|
||||
# cosSim = embd1.similarity(embd2)
|
||||
# elif is_STRF:
|
||||
# embd1 = cast(Tensor, embds[idx1][0])
|
||||
# embd2 = cast(Tensor, embds[idx2][0])
|
||||
# cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
|
||||
# cosSim = cast(float, cosSim.item())
|
||||
|
||||
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
|
||||
# cosineSim_idx_matrix.at[idx1, idx2] = cosSim
|
||||
|
||||
logger.info('Similarity scores calculated successfully.')
|
||||
# logger.info('Similarity scores calculated successfully.')
|
||||
|
||||
return cosineSim_idx_matrix, embds
|
||||
# return cosineSim_idx_matrix, embds
|
||||
|
||||
|
||||
# obtain index pairs with cosine similarity
|
||||
# greater than or equal to given threshold value
|
||||
def filt_thresh_cosSim_matrix(
|
||||
cosineSim_idx_matrix: DataFrame,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
threshold: float,
|
||||
) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
|
||||
"""filter similarity matrix by threshold value and return index pairs with
|
||||
a similarity score greater than the provided threshold
|
||||
# def filt_thresh_cosSim_matrix(
|
||||
# cosineSim_idx_matrix: DataFrame,
|
||||
# embds: dict[int, tuple[Embedding, str]],
|
||||
# threshold: float,
|
||||
# ) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
|
||||
# """filter similarity matrix by threshold value and return index pairs with
|
||||
# a similarity score greater than the provided threshold
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float
|
||||
similarity threshold
|
||||
cosineSim_idx_matrix : DataFrame
|
||||
similarity matrix
|
||||
# Parameters
|
||||
# ----------
|
||||
# threshold : float
|
||||
# similarity threshold
|
||||
# cosineSim_idx_matrix : DataFrame
|
||||
# similarity matrix
|
||||
|
||||
Returns
|
||||
-------
|
||||
Series
|
||||
series with multi index (index pairs) and corresponding similarity score
|
||||
"""
|
||||
cosineSim_filt = cast(
|
||||
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
|
||||
)
|
||||
# Returns
|
||||
# -------
|
||||
# Series
|
||||
# series with multi index (index pairs) and corresponding similarity score
|
||||
# """
|
||||
# cosineSim_filt = cast(
|
||||
# Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
|
||||
# )
|
||||
|
||||
return cosineSim_filt, embds
|
||||
# return cosineSim_filt, embds
|
||||
|
||||
|
||||
def list_cosSim_dupl_candidates(
|
||||
cosineSim_filt: Series,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
save_candidates: bool = False,
|
||||
saving_path: Path | None = None,
|
||||
filename: str = 'CosSim-FilterCandidates',
|
||||
pipeline: Pipeline | None = None,
|
||||
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
|
||||
"""providing an overview of candidates with a similarity score greater than
|
||||
given threshold; more suitable for debugging purposes
|
||||
# def list_cosSim_dupl_candidates(
|
||||
# cosineSim_filt: Series,
|
||||
# embds: dict[int, tuple[Embedding, str]],
|
||||
# save_candidates: bool = False,
|
||||
# saving_path: Path | None = None,
|
||||
# filename: str = 'CosSim-FilterCandidates',
|
||||
# pipeline: Pipeline | None = None,
|
||||
# ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
|
||||
# """providing an overview of candidates with a similarity score greater than
|
||||
# given threshold; more suitable for debugging purposes
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
contains indices, corresponding texts and similarity score to evaluate results
|
||||
list[tuple[Index, Index]]
|
||||
list containing relevant index pairs for entries with similarity score greater than
|
||||
given threshold
|
||||
"""
|
||||
logger.info('Start gathering of similarity candidates...')
|
||||
# compare found duplicates
|
||||
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
# Returns
|
||||
# -------
|
||||
# DataFrame
|
||||
# contains indices, corresponding texts and similarity score to evaluate results
|
||||
# list[tuple[Index, Index]]
|
||||
# list containing relevant index pairs for entries with similarity score greater than
|
||||
# given threshold
|
||||
# """
|
||||
# logger.info('Start gathering of similarity candidates...')
|
||||
# # compare found duplicates
|
||||
# columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
# df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
# index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
embds[idx1][1],
|
||||
idx2,
|
||||
embds[idx2][1],
|
||||
score,
|
||||
]
|
||||
]
|
||||
# add candidates to collection DataFrame
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
if df_candidates.empty:
|
||||
df_candidates = df_conc.copy()
|
||||
else:
|
||||
df_candidates = pd.concat([df_candidates, df_conc])
|
||||
# save index pairs
|
||||
index_pairs.append((idx1, idx2))
|
||||
# for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
# # get text content from embedding as second tuple entry
|
||||
# content = [
|
||||
# [
|
||||
# idx1,
|
||||
# embds[idx1][1],
|
||||
# idx2,
|
||||
# embds[idx2][1],
|
||||
# score,
|
||||
# ]
|
||||
# ]
|
||||
# # add candidates to collection DataFrame
|
||||
# df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
# if df_candidates.empty:
|
||||
# df_candidates = df_conc.copy()
|
||||
# else:
|
||||
# df_candidates = pd.concat([df_candidates, df_conc])
|
||||
# # save index pairs
|
||||
# index_pairs.append((idx1, idx2))
|
||||
|
||||
logger.info('Similarity candidates gathered successfully.')
|
||||
# logger.info('Similarity candidates gathered successfully.')
|
||||
|
||||
if save_candidates:
|
||||
if saving_path is None:
|
||||
raise ValueError(
|
||||
('Saving path must be provided if duplicate ' 'candidates should be saved.')
|
||||
)
|
||||
elif pipeline is not None:
|
||||
target_filename = (
|
||||
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
|
||||
)
|
||||
elif pipeline is None:
|
||||
target_filename = f'{filename}.xlsx'
|
||||
logger.info('Saving similarity candidates...')
|
||||
target_path = saving_path.joinpath(target_filename)
|
||||
df_candidates.to_excel(target_path)
|
||||
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
|
||||
# if save_candidates:
|
||||
# if saving_path is None:
|
||||
# raise ValueError(
|
||||
# ('Saving path must be provided if duplicate ' 'candidates should be saved.')
|
||||
# )
|
||||
# elif pipeline is not None:
|
||||
# target_filename = (
|
||||
# f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
|
||||
# )
|
||||
# elif pipeline is None:
|
||||
# target_filename = f'{filename}.xlsx'
|
||||
# logger.info('Saving similarity candidates...')
|
||||
# target_path = saving_path.joinpath(target_filename)
|
||||
# df_candidates.to_excel(target_path)
|
||||
# logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
|
||||
|
||||
return index_pairs, embds
|
||||
# return index_pairs, embds
|
||||
|
||||
|
||||
# TODO: change implementation fully to SentenceTransformer
|
||||
# usage of batch processing for embeddings, use candidate idx function
|
||||
# from time analysis --> moved to ``helpers.py``
|
||||
"""
|
||||
def similar_ids_connection_graph(
|
||||
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
|
||||
) -> tuple[Graph, dict[str, int]]:
|
||||
# build index graph to obtain graph of connected (similar) indices
|
||||
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
for (idx1, idx2) in similar_idx_pairs:
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)

graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)

return similar_id_graph, graph_info

def similar_ids_groups(
dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=dupl_id_graph))

for id_group in ids_groups:
yield list(id_group)
"""
# def similar_ids_connection_graph(
# similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
# ) -> tuple[Graph, dict[str, int]]:
# # build index graph to obtain graph of connected (similar) indices
# # use this graph to get connected components (indices which belong together)
# # retain semantic connection on whole dataset
# similar_id_graph = nx.Graph()
# for (idx1, idx2) in similar_idx_pairs:
# # inplace operation, parent/child do not really exist in undirected graph
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)

# graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)

# return similar_id_graph, graph_info

# def similar_ids_groups(
# dupl_id_graph: Graph,
# ) -> Iterator[list[PandasIndex]]:
# # groups of connected indices
# ids_groups = cast(Iterator[set[PandasIndex]],
# nx.connected_components(G=dupl_id_graph))

# for id_group in ids_groups:
# yield list(id_group)


# merge duplicates
def merge_similarity_dupl_old(
data: DataFrame,
dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]:
# copy pre-cleaned data
temp = data.copy()
index = temp.index
# logger.info("Start merging of similarity candidates...")
# # merge duplicates
# def merge_similarity_dupl_old(
# data: DataFrame,
# dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
# ) -> tuple[DataFrame]:
# # copy pre-cleaned data
# temp = data.copy()
# index = temp.index
# # logger.info("Start merging of similarity candidates...")

# iterate over index pairs
for i1, i2 in tqdm(dupl_idx_pairs):
# if an entry does not exist any more, skip this pair
if i1 not in index or i2 not in index:
continue
# # iterate over index pairs
# for i1, i2 in tqdm(dupl_idx_pairs):
# # if an entry does not exist any more, skip this pair
# if i1 not in index or i2 not in index:
# continue

# merge num occur
num_occur1 = temp.at[i1, 'num_occur']
num_occur2 = temp.at[i2, 'num_occur']
new_num_occur = num_occur1 + num_occur2
# # merge num occur
# num_occur1 = temp.at[i1, 'num_occur']
# num_occur2 = temp.at[i2, 'num_occur']
# new_num_occur = num_occur1 + num_occur2

# merge associated object ids
assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
new_assoc_ids = np.unique(new_assoc_ids.flatten())
# # merge associated object ids
# assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
# assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
# new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
# new_assoc_ids = np.unique(new_assoc_ids.flatten())

# recalculate num associated obj ids
new_num_assoc_obj_ids = len(new_assoc_ids)
# # recalculate num associated obj ids
# new_num_assoc_obj_ids = len(new_assoc_ids)

# write properties to first entry
temp.at[i1, 'num_occur'] = new_num_occur
temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids
# # write properties to first entry
# temp.at[i1, 'num_occur'] = new_num_occur
# temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
# temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids

# drop second entry
temp = temp.drop(index=i2)
index = temp.index
# # drop second entry
# temp = temp.drop(index=i2)
# index = temp.index

# logger.info("Similarity candidates merged successfully.")
# # logger.info("Similarity candidates merged successfully.")

return (temp,)
# return (temp,)


# ** debugging and evaluation
def choose_cosSim_dupl_candidates(
cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
"""providing an overview of candidates with a similarity score greater than
given threshold, but decision is made manually by iterating through the candidates
with user interaction; more suitable for debugging purposes
# def choose_cosSim_dupl_candidates(
# cosineSim_filt: Series,
# embds: dict[int, tuple[Embedding, str]],
# ) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
# """providing an overview of candidates with a similarity score greater than
# given threshold, but decision is made manually by iterating through the candidates
# with user interaction; more suitable for debugging purposes

Returns
-------
DataFrame
contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than
given threshold
"""
# Returns
# -------
# DataFrame
# contains indices, corresponding texts and similarity score to evaluate results
# list[tuple[Index, Index]]
# list containing relevant index pairs for entries with similarity score greater than
# given threshold
# """

# compare found duplicates
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns)
# # compare found duplicates
# columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
# df_candidates = pd.DataFrame(columns=columns)

index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
# index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# get texts for comparison
text1 = embds[idx1][1]
text2 = embds[idx2][1]
# get decision
print('---------- New Decision ----------')
print('text1:\n', text1, '\n', flush=True)
print('text2:\n', text2, '\n', flush=True)
decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')
# for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# # get texts for comparison
# text1 = embds[idx1][1]
# text2 = embds[idx2][1]
# # get decision
# print('---------- New Decision ----------')
# print('text1:\n', text1, '\n', flush=True)
# print('text2:\n', text2, '\n', flush=True)
# decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')

if not decision == 'y':
continue
# if not decision == 'y':
# continue

# get text content from embedding as second tuple entry
content = [
[
idx1,
text1,
idx2,
text2,
score,
]
]
df_conc = pd.DataFrame(columns=columns, data=content)
# # get text content from embedding as second tuple entry
# content = [
# [
# idx1,
# text1,
# idx2,
# text2,
# score,
# ]
# ]
# df_conc = pd.DataFrame(columns=columns, data=content)

df_candidates = pd.concat([df_candidates, df_conc])
index_pairs.append((idx1, idx2))
# df_candidates = pd.concat([df_candidates, df_conc])
# index_pairs.append((idx1, idx2))

return df_candidates, index_pairs
# return df_candidates, index_pairs

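For orientation, a minimal sketch (using only networkx, not part of the diff) of the idea behind similar_index_connection_graph and similar_ids_groups: similarity pairs become edges of an undirected graph, and its connected components are the groups of indices that belong to one duplicate cluster, even when entries are only transitively similar.

import networkx as nx

# hypothetical similarity pairs between DataFrame indices
similar_idx_pairs = [(0, 3), (3, 7), (2, 5)]

graph = nx.Graph()
graph.add_edges_from(similar_idx_pairs)

# 0-3 and 3-7 end up in the same group although (0, 7) was never a pair
groups = {frozenset(component) for component in nx.connected_components(graph)}
assert groups == {frozenset({0, 3, 7}), frozenset({2, 5})}
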
@ -22,7 +22,7 @@ pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)?([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')


@ -43,7 +43,7 @@ def clean_string_slim(string: str) -> str:
cleaned entry
"""
# remove special chars
string = pattern_escape_newline.sub('. ', string)
# string = pattern_escape_newline.sub(' ', string)
string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string)
@ -127,7 +127,7 @@ def candidates_by_index(

def similar_index_connection_graph(
similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
) -> tuple[Graph, dict[str, float]]:
# build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset

17
src/lang_main/config.py
Normal file
@ -0,0 +1,17 @@
from __future__ import annotations

import sys
import tomllib
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from pathlib import Path


def load_toml_config(
    path_to_toml: str | Path,
) -> dict[str, Any]:
    with open(path_to_toml, 'rb') as f:
        data = tomllib.load(f)
    print('Loaded TOML config file successfully.', file=sys.stderr, flush=True)
    return data

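A minimal usage sketch of the new loader (grounded in test_config.py further below; the concrete file path is an assumption for illustration, the test resolves it via pkg_dir instead):

from lang_main.config import load_toml_config

# hypothetical path; any readable TOML file works
cfg = load_toml_config('src/lang_main/lang_main_config.toml')
assert cfg['info']['pkg'] == 'lang_main'
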
@ -2,22 +2,21 @@ from enum import Enum  # noqa: I001
from importlib.util import find_spec
from pathlib import Path
from typing import Final
import os

from sentence_transformers import SimilarityFunction

from lang_main import CONFIG, CYTO_PATH_STYLESHEET
from lang_main import model_loader as m_load
from lang_main import CONFIG, CYTO_PATH_STYLESHEET, BASE_PATH
from lang_main.types import (
CytoLayoutProperties,
CytoLayouts,
LanguageModels,
ModelLoaderMap,
ONNXExecutionProvider,  # noqa: F401
STFRBackends,
STFRDeviceTypes,
STFRModelArgs,
STFRModels,
STFRModelTypes,
STFRQuantFilenames,  # noqa: F401
SpacyModelTypes,
)

__all__ = [
@ -67,35 +66,29 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']

# ** models
# ** loading
SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models'
MODEL_BASE_FOLDER: Final[Path] = BASE_PATH / MODEL_BASE_FOLDER_NAME
if not MODEL_BASE_FOLDER.exists():
raise FileNotFoundError('Language model folder not found.')
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_DEP_NEWS_TRF
STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
# 'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
# 'provider': ONNXExecutionProvider.CPU,
# 'export': False,
# }
MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
LanguageModels.SENTENCE_TRANSFORMER: {
'func': m_load.load_sentence_transformer,
'kwargs': {
'model_name': STFR_MODEL_NAME,
'similarity_func': STFR_SIMILARITY,
'backend': STFR_BACKEND,
'device': STFR_DEVICE,
'model_kwargs': STFR_MODEL_ARGS,
},
},
LanguageModels.SPACY: {
'func': m_load.load_spacy,
'kwargs': {
'model_name': SPACY_MODEL_NAME,
},
},
STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
'provider': ONNXExecutionProvider.CPU,
'export': False,
}
stfr_model_args: STFRModelArgs
if STFR_BACKEND == STFRBackends.ONNX:
stfr_model_args = STFR_MODEL_ARGS_ONNX
else:
stfr_model_args = STFR_MODEL_ARGS_DEFAULT

STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
# ** language dependency analysis
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])

@ -1,3 +1,9 @@
# ** meta exceptions
class LanguageModelNotFoundError(Exception):
"""Error raised if a given language model could not be loaded successfully"""


# ** token graph exceptions
class EdgePropertyNotContainedError(Exception):
"""Error raised if a needed edge property is not contained in graph edges"""

@ -21,8 +27,6 @@ class DependencyMissingError(Exception):


# ** pipelines to perform given actions on dataset in a customisable manner


class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""


@ -1,7 +1,6 @@
import base64
import pickle
import shutil
import tomllib
from pathlib import Path
from typing import Any

@ -33,15 +32,6 @@ def create_saving_folder(
)


def load_toml_config(
path_to_toml: str | Path,
) -> dict[str, Any]:
with open(path_to_toml, 'rb') as f:
data = tomllib.load(f)
logger.info('Loaded TOML config file successfully.')
return data


# saving and loading using pickle
# careful: pickling from unknown sources can be dangerous
def save_pickle(

@ -1,4 +1,6 @@
# lang_main: Config file
[info]
pkg = 'lang_main'

[paths]
inputs = './inputs/'

@ -5,6 +5,7 @@ from time import gmtime
from typing import Final

from lang_main.constants import (
BASE_PATH,
ENABLE_LOGGING,
LOGGING_TO_FILE,
LOGGING_TO_STDERR,
@ -15,11 +16,11 @@ from lang_main.types import LoggingLevels
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
# logging.basicConfig(
# format=LOG_FMT,
# datefmt=LOG_DATE_FMT,
# )
LOG_FILE_FOLDER: Final[Path] = BASE_PATH / 'logs'
if not LOG_FILE_FOLDER.exists():
LOG_FILE_FOLDER.mkdir(parents=True)

LOG_FILE_PATH: Final[Path] = LOG_FILE_FOLDER / 'lang-main.log'

# ** formatters
logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)

@ -1,16 +1,25 @@
from __future__ import annotations

import importlib
from typing import (
TYPE_CHECKING,
Any,
Final,
Literal,
cast,
overload,
)

import spacy
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, SimilarityFunction

from lang_main.constants import STFR_SIMILARITY
from lang_main.constants import (
SPACY_MODEL_NAME,
STFR_BACKEND,
STFR_DEVICE,
STFR_MODEL_ARGS,
STFR_MODEL_NAME,
STFR_SIMILARITY,
)
from lang_main.errors import LanguageModelNotFoundError
from lang_main.types import (
LanguageModels,
Model,
@ -20,9 +29,6 @@ from lang_main.types import (
STFRDeviceTypes,
)

if TYPE_CHECKING:
from sentence_transformers import SimilarityFunction


@overload
def instantiate_model(
@ -53,14 +59,27 @@ def instantiate_model(
def load_spacy(
model_name: str,
) -> SpacyModel:
return spacy.load(model_name)
try:
spacy_model_obj = importlib.import_module(SPACY_MODEL_NAME)
except ModuleNotFoundError:
raise LanguageModelNotFoundError(
(
f'Could not find spaCy model >>{model_name}<<. '
f'Check if it is installed correctly.'
)
)
pretrained_model = cast(SpacyModel, spacy_model_obj.load())

return pretrained_model


def load_sentence_transformer(
model_name: str,
similarity_func: SimilarityFunction = STFR_SIMILARITY,
similarity_func: SimilarityFunction = SimilarityFunction.COSINE,
backend: STFRBackends = STFRBackends.TORCH,
device: STFRDeviceTypes = STFRDeviceTypes.CPU,
local_files_only: bool = False,
model_save_folder: str | None = None,
model_kwargs: dict[str, Any] | None = None,
) -> SentenceTransformer:
return SentenceTransformer(
@ -68,5 +87,28 @@ def load_sentence_transformer(
similarity_fn_name=similarity_func,
backend=backend,  # type: ignore Literal matches Enum
device=device,
cache_folder=model_save_folder,
local_files_only=local_files_only,
model_kwargs=model_kwargs,
)


# ** configured model builder functions
MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
LanguageModels.SENTENCE_TRANSFORMER: {
'func': load_sentence_transformer,
'kwargs': {
'model_name': STFR_MODEL_NAME,
'similarity_func': STFR_SIMILARITY,
'backend': STFR_BACKEND,
'device': STFR_DEVICE,
'model_kwargs': STFR_MODEL_ARGS,
},
},
LanguageModels.SPACY: {
'func': load_spacy,
'kwargs': {
'model_name': SPACY_MODEL_NAME,
},
},
}

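For orientation, the map is consumed via instantiate_model; the following sketch mirrors what the new tests at the end of this commit do, so no API beyond the diff is assumed:

from sentence_transformers import SentenceTransformer

from lang_main import model_loader
from lang_main.types import LanguageModels

# look up the configured loader function plus its kwargs and build the model
model = model_loader.instantiate_model(
    model_load_map=model_loader.MODEL_LOADER_MAP,
    model=LanguageModels.SENTENCE_TRANSFORMER,
)
assert isinstance(model, SentenceTransformer)
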
@ -30,7 +30,6 @@ from lang_main.constants import (
DATE_COLS,
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
MODEL_LOADER_MAP,
NAME_DELTA_FEAT_TO_REPAIR,
SAVE_PATH_FOLDER,
THRESHOLD_AMOUNT_CHARACTERS,
@ -41,6 +40,7 @@ from lang_main.constants import (
THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE,
)
from lang_main.model_loader import MODEL_LOADER_MAP
from lang_main.pipelines.base import Pipeline
from lang_main.types import EntryPoints, LanguageModels


@ -45,13 +45,20 @@ class ONNXExecutionProvider(enum.StrEnum):
CPU = 'CPUExecutionProvider'


class STFRModels(enum.StrEnum):
class STFRModelTypes(enum.StrEnum):
ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'


class SpacyModelTypes(enum.StrEnum):
DE_CORE_NEWS_SM = 'de_core_news_sm'
DE_CORE_NEWS_MD = 'de_core_news_md'
DE_CORE_NEWS_LG = 'de_core_news_lg'
DE_DEP_NEWS_TRF = 'de_dep_news_trf'


class STFRQuantFilenames(enum.StrEnum):
ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'

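Since these enums derive from enum.StrEnum (Python 3.11+), their members behave like plain strings, which is why they can be passed directly wherever a model name string is expected. A short illustrative snippet, not part of the diff:

import enum


class STFRModelTypes(enum.StrEnum):
    ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'


# StrEnum members are str instances and compare equal to their values
assert STFRModelTypes.ALL_MPNET_BASE_V2 == 'all-mpnet-base-v2'
assert isinstance(STFRModelTypes.ALL_MPNET_BASE_V2, str)
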
1002
tests/Dummy_Dataset_N_1000.csv
Normal file
File diff suppressed because it is too large
BIN
tests/analyse_dataset.xlsx
Normal file
Binary file not shown.
0
tests/analysis/__init__.py
Normal file
168
tests/analysis/test_graphs.py
Normal file
@ -0,0 +1,168 @@
import networkx as nx
import pytest

from lang_main.analysis import graphs

TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'


def build_init_graph(token_graph: bool):
    edge_weights = [
        {'weight': 1},
        {'weight': 2},
        {'weight': 3},
        {'weight': 4},
        {'weight': 5},
        {'weight': 6},
    ]
    edges = [
        (1, 2),
        (1, 3),
        (2, 4),
        (3, 4),
        (1, 4),
        (2, 1),
    ]
    edges_to_add = []
    for i, edge in enumerate(edges):
        edge = list(edge)
        edge.append(edge_weights[i])  # type: ignore
        edges_to_add.append(tuple(edge))

    if token_graph:
        G = graphs.TokenGraph(name=TK_GRAPH_NAME, enable_logging=False)
    else:
        G = nx.DiGraph()

    G.add_edges_from(edges_to_add)

    return G


@pytest.fixture(scope='module')
def graph():
    return build_init_graph(token_graph=False)


@pytest.fixture(scope='module')
def tk_graph():
    return build_init_graph(token_graph=True)


def test_graph_size(graph):
    assert len(graph.nodes) == 4
    assert len(graph.edges) == 6


def test_save_to_GraphML(graph, tmp_path):
    filename = 'test_graphML'
    graphs.save_to_GraphML(graph, saving_path=tmp_path, filename=filename)
    saved_file = (tmp_path / filename).with_suffix('.graphml')
    assert saved_file.exists()


def test_metadata_retrieval(graph):
    metadata = graphs.get_graph_metadata(graph)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 6
    assert metadata['node_memory'] == 112
    assert metadata['edge_memory'] == 336
    assert metadata['total_memory'] == 448


def test_graph_update_batch():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, batch=((4, 5), (5, 6)), weight_connection=8)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 6
    assert metadata['num_edges'] == 8
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 8


def test_graph_update_single_new():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=4, child=5, weight_connection=7)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 5
    assert metadata['num_edges'] == 7
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 7


def test_graph_update_single_existing():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=1, child=4, weight_connection=5)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 10


@pytest.mark.parametrize('cast_int', [True, False])
def test_graph_undirected_conversion(graph, cast_int):
    graph_undir = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)
    # edges: (1, 2, w=1) and (2, 1, w=6) --> undirected: (1, 2, w=7)
    assert graph_undir[1][2]['weight'] == pytest.approx(7.0)


def test_graph_cytoscape_conversion(graph):
    cyto_graph, weight_data = graphs.convert_graph_to_cytoscape(graph)
    node = cyto_graph[0]
    edge = cyto_graph[-1]
    assert node['data']['id'] == 1  # type: ignore
    assert edge['data']['source'] == 3  # type: ignore
    assert edge['data']['target'] == 4  # type: ignore
    assert edge['data']['weight'] == 4  # type: ignore
    assert weight_data['min'] == 1
    assert weight_data['max'] == 6


def test_tk_graph_properties(tk_graph):
    assert tk_graph.name == TK_GRAPH_NAME
    assert isinstance(tk_graph.directed, graphs.TokenGraph)
    assert isinstance(tk_graph.undirected, nx.Graph)
    tk_graph.update_metadata()
    metadata_directed = tk_graph.metadata_directed
    assert metadata_directed['num_nodes'] == 4
    assert metadata_directed['num_edges'] == 6
    assert metadata_directed['min_edge_weight'] == 1
    assert metadata_directed['max_edge_weight'] == 6
    assert metadata_directed['node_memory'] == 112
    assert metadata_directed['edge_memory'] == 336
    assert metadata_directed['total_memory'] == 448
    metadata_undirected = tk_graph.metadata_undirected
    assert metadata_undirected['num_nodes'] == 4
    assert metadata_undirected['num_edges'] == 5
    assert metadata_undirected['min_edge_weight'] == 2
    assert metadata_undirected['max_edge_weight'] == 7
    assert metadata_undirected['node_memory'] == 112
    assert metadata_undirected['edge_memory'] == 280
    assert metadata_undirected['total_memory'] == 392


def test_graph_degree_filter(tk_graph):
    filtered_graph = graphs.filter_graph_by_node_degree(
        tk_graph,
        bound_lower=3,
        bound_upper=3,
    )
    assert len(filtered_graph.nodes) == 2


def test_graph_edge_number_filter(tk_graph):
    number_edges_limit = 1
    filtered_graph = graphs.filter_graph_by_number_edges(
        tk_graph,
        limit=number_edges_limit,
    )
    assert len(filtered_graph.edges) == number_edges_limit
    filtered_graph = graphs.filter_graph_by_node_degree(
        filtered_graph,
        bound_lower=1,
        bound_upper=None,
    )
    assert len(filtered_graph.nodes) == 2, 'one edge should result in only two nodes'
73
tests/analysis/test_preprocessing.py
Normal file
@ -0,0 +1,73 @@
"""testing each function in a consecutive way, like each one is
executed in a pipeline
"""

from lang_main.analysis import preprocessing as ppc
from lang_main.analysis import shared


def test_load_data(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    assert len(data) == 1000


def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    assert len(data) == 999


def test_remove_na(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    assert len(data) == 998


# def test_string_cleansing():
#     string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
#     cleaned_string = shared.clean_string_slim(string)
#     target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
#     assert cleaned_string == target_string


def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string


def test_analyse_feature(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string

    (data,) = ppc.analyse_feature(data, target_feature=target_features[0])
    assert len(data) == 139
23
tests/conftest.py
Normal file
@ -0,0 +1,23 @@
from pathlib import Path

import pytest

DATE_COLS: tuple[str, ...] = (
    'VorgangsDatum',
    'ErledigungsDatum',
    'Arbeitsbeginn',
    'ErstellungsDatum',
)


@pytest.fixture(scope='session')
def raw_data_path():
    pth_data = Path('./tests/Dummy_Dataset_N_1000.csv')
    assert pth_data.exists()

    return pth_data


@pytest.fixture(scope='session')
def raw_data_date_cols():
    return DATE_COLS
@ -1,56 +0,0 @@
# lang_main: Config file

[paths]
inputs = '../scripts/inputs/'
results = '../scripts/results/test_new2/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
0
tests/pipelines/__init__.py
Normal file
7
tests/test_config.py
Normal file
@ -0,0 +1,7 @@
from lang_main import config, pkg_dir


def test_load_config():
    toml_path = pkg_dir / 'lang_main_config.toml'
    loaded_cfg = config.load_toml_config(toml_path)
    assert loaded_cfg['info']['pkg'] == 'lang_main'
57
tests/test_io.py
Normal file
@ -0,0 +1,57 @@
import pytest

from lang_main import io

CONTENT = 'test_lang_main'


@pytest.mark.parametrize(
    'overwrite',
    [True, False],
)
def test_create_saving_folder(tmp_path, overwrite):
    target_dir = tmp_path / 'test'
    assert not target_dir.exists()
    io.create_saving_folder(target_dir, overwrite_existing=overwrite)
    assert target_dir.exists()
    assert target_dir.is_dir()


def test_save_load(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    loaded = io.load_pickle(save_pth)
    assert loaded == CONTENT
    b64_str = io.encode_to_base64_str(CONTENT)
    b64_str_file = io.encode_file_to_base64_str(save_pth)
    assert b64_str == b64_str_file
    b64_decoded = io.decode_from_base64_str(b64_str)
    assert b64_decoded == CONTENT
    b64_decoded_file = io.decode_from_base64_str(b64_str_file)
    assert b64_decoded_file == CONTENT


def test_get_entry_point(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main',
        '.pkl',
        check_existence=True,
    )
    assert pth.exists()
    with pytest.raises(FileNotFoundError):
        _ = io.get_entry_point(
            tmp_path,
            'test_lang_main2',
            '.pkl',
            check_existence=True,
        )
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main2',
        '.pkl',
        check_existence=False,
    )
    assert not pth.exists()
5
tests/test_lang_main_init.py
Normal file
@ -0,0 +1,5 @@
from lang_main import BASE_PATH


def test_base_path():
    assert BASE_PATH is not None
113
tests/test_model_loader.py
Normal file
@ -0,0 +1,113 @@
import pytest
from sentence_transformers import SentenceTransformer
from spacy.language import Language

from lang_main import model_loader
from lang_main.constants import (
    STFR_MODEL_ARGS_ONNX,
    SimilarityFunction,
    SpacyModelTypes,
    STFRBackends,
    STFRDeviceTypes,
    STFRModelTypes,
)
from lang_main.types import LanguageModels


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.TORCH,
        device=STFRDeviceTypes.CPU,
        model_kwargs=None,
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer_onnx(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.ONNX,
        device=STFRDeviceTypes.CPU,
        model_kwargs=STFR_MODEL_ARGS_ONNX,  # type: ignore
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'model_name',
    [
        SpacyModelTypes.DE_CORE_NEWS_SM,
        SpacyModelTypes.DE_CORE_NEWS_MD,
        SpacyModelTypes.DE_CORE_NEWS_LG,
        SpacyModelTypes.DE_DEP_NEWS_TRF,
    ],
)
@pytest.mark.mload
def test_load_spacy_model(
    model_name,
):
    model = model_loader.load_spacy(
        model_name=model_name,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_spacy_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SPACY,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_stfr_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SENTENCE_TRANSFORMER,
    )
    assert isinstance(model, SentenceTransformer)
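The mload marker used above is a custom marker for the slow tests that actually download or load language models. Its registration is not shown in this commit; a hypothetical hook in tests/conftest.py (standard pytest API only) could register it so pytest does not warn and so these tests can be deselected with -m "not mload":

# hypothetical addition to tests/conftest.py, not included in this commit
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'mload: tests that download and load language models (slow)',
    )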