started adding comprehensive unit tests
This commit is contained in:
@@ -1,58 +0,0 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = './inputs/'
|
||||
# results = './results/dummy_N_1000/'
|
||||
# dataset = '../data/Dummy_Dataset_N_1000.csv'
|
||||
results = './results/'
|
||||
dataset = '../data/02_202307/Export4.csv'
|
||||
|
||||
# only debugging features, production-ready pipelines should always
|
||||
# be fully executed
|
||||
[control]
|
||||
preprocessing_skip = true
|
||||
token_analysis_skip = false
|
||||
graph_postprocessing_skip = false
|
||||
graph_rescaling_skip = false
|
||||
graph_static_rendering_skip = false
|
||||
time_analysis_skip = true
|
||||
|
||||
[preprocess]
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_number = 300
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis.uniqueness]
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
|
||||
[time_analysis.preparation]
|
||||
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
|
||||
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
|
||||
|
||||
[time_analysis.model_input]
|
||||
# input_features = [
|
||||
# 'VorgangsTypName',
|
||||
# 'VorgangsArtText',
|
||||
# 'VorgangsBeschreibung',
|
||||
# ]
|
||||
input_features = [
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
activity_feature = 'VorgangsTypName'
|
||||
activity_types = [
|
||||
'Reparaturauftrag (Portal)',
|
||||
'Störungsmeldung',
|
||||
]
|
||||
threshold_num_acitivities = 1
|
||||
threshold_similarity = 0.8
|
||||
@@ -19,6 +19,900 @@
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import networkx as nx"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"edge_weighst = [\n",
|
||||
" {'weight': 1},\n",
|
||||
" {'weight': 2},\n",
|
||||
" {'weight': 3},\n",
|
||||
" {'weight': 4},\n",
|
||||
" {'weight': 5},\n",
|
||||
" {'weight': 6},\n",
|
||||
"]\n",
|
||||
"edges = [\n",
|
||||
" (1, 2),\n",
|
||||
" (1, 3),\n",
|
||||
" (2, 4),\n",
|
||||
" (3, 4),\n",
|
||||
" (1, 4),\n",
|
||||
" (2, 1),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"edges_to_add = []\n",
|
||||
"for i, edge in enumerate(edges):\n",
|
||||
" edge = list(edge)\n",
|
||||
" edge.append(edge_weighst[i])\n",
|
||||
" edges_to_add.append(tuple(edge))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 2, {'weight': 1}),\n",
|
||||
" (1, 3, {'weight': 2}),\n",
|
||||
" (2, 4, {'weight': 3}),\n",
|
||||
" (3, 4, {'weight': 4}),\n",
|
||||
" (1, 4, {'weight': 5}),\n",
|
||||
" (2, 1, {'weight': 6})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"edges_to_add"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G = nx.DiGraph()\n",
|
||||
"G.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "91d4094b-f886-4056-a697-5223f157f1d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tk = graphs.TokenGraph()\n",
|
||||
"tk.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "518cada9-561a-4b96-b750-3d500d1d28b9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main.analysis import graphs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'num_nodes': 4,\n",
|
||||
" 'num_edges': 6,\n",
|
||||
" 'min_edge_weight': 1,\n",
|
||||
" 'max_edge_weight': 6,\n",
|
||||
" 'node_memory': 112,\n",
|
||||
" 'edge_memory': 336,\n",
|
||||
" 'total_memory': 448}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"graphs.get_graph_metadata(G)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ca2ce8e8-d72a-4edf-ae42-0f79bd9d19a2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "223dc592-fa56-4536-a5c2-a166001a6aca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>6.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 1 2 3 4\n",
|
||||
"1 0.0 1.0 2.0 5.0\n",
|
||||
"2 6.0 0.0 0.0 3.0\n",
|
||||
"3 0.0 0.0 0.0 4.0\n",
|
||||
"4 0.0 0.0 0.0 0.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nx.to_pandas_adjacency(G)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "1f677da0-7416-413c-adb1-ae1384e09349",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G_undir = graphs.convert_graph_to_undirected(G, cast_int=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "356862fb-2383-43d9-80ba-4fe83646c9d9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>7.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.0</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 1 2 3 4\n",
|
||||
"1 0.0 7.0 2.0 5.0\n",
|
||||
"2 7.0 0.0 0.0 3.0\n",
|
||||
"3 2.0 0.0 0.0 4.0\n",
|
||||
"4 5.0 3.0 4.0 0.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nx.to_pandas_adjacency(G_undir)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "b8a3db1a-0d2a-4635-ab88-7802e2cf59e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G_undir.is_directed()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "46001528-75b0-4fe8-a3ec-353bbd3eeeff",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'weight': 7.0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G_undir[1][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "cf2dcdff-f0b7-416e-9db3-c7a21ea96b96",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"([{'data': {'id': 1, 'label': 1}},\n",
|
||||
" {'data': {'id': 2, 'label': 2}},\n",
|
||||
" {'data': {'id': 3, 'label': 3}},\n",
|
||||
" {'data': {'id': 4, 'label': 4}},\n",
|
||||
" {'data': {'source': 1, 'target': 2, 'weight': 1}},\n",
|
||||
" {'data': {'source': 1, 'target': 3, 'weight': 2}},\n",
|
||||
" {'data': {'source': 1, 'target': 4, 'weight': 5}},\n",
|
||||
" {'data': {'source': 2, 'target': 4, 'weight': 3}},\n",
|
||||
" {'data': {'source': 2, 'target': 1, 'weight': 6}},\n",
|
||||
" {'data': {'source': 3, 'target': 4, 'weight': 4}}],\n",
|
||||
" {'min': 1, 'max': 6})"
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"graphs.convert_graph_to_cytoscape(G)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f82481e9-873f-4657-80d3-ba75af74fa27",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"TokenGraph(name: TokenGraph, number of nodes: 4, number of edges: 6)"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tk.update_metadata()\n",
|
||||
"tk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"id": "4b806620-b469-45ef-823b-db46f8590509",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(G.degree)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"id": "2a41d019-1b6b-46f7-b13e-ac22da737940",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"4"
|
||||
]
|
||||
},
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G.degree[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e99f2fb4-4c8d-4564-810d-a4b2ed9d6009",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(tk.degree)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "1368ebf6-e008-492d-8d15-fe3ed12b78a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"g_filt = graphs.filter_graph_by_node_degree(\n",
|
||||
" tk,\n",
|
||||
" bound_lower=3,\n",
|
||||
" bound_upper=3,\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "de69f73e-da1d-4479-81da-006f2ce61844",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"TokenGraph(name: TokenGraph, number of nodes: 2, number of edges: 1)"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"g_filt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "b12fd64d-737e-4c68-94ea-72a817647a04",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[2, 4]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(g_filt.nodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "21434c7c-887c-4f9f-884a-48514e2279e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"G = nx.DiGraph()\n",
|
||||
"G.add_edges_from(edges_to_add)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "2defef69-f09a-4869-984a-27b6373b17b9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'weight': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"G[1][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "0308a2ac-f554-4e24-9ddb-578dd588f3c8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"5"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(G.edges)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15d9ce65-f9a5-40de-a737-098579f6a8ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7acf4be7-45f3-45e6-87f5-14343f23d610",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "9139812b-74ba-45ce-adfc-e57667259692",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded TOML config file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lang_main import search_iterative, search_base_path\n",
|
||||
"from pathlib import Path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "35118922-3a17-4698-93bc-5292a276a4b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main import constants"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "341299bf-e926-4e55-8545-8805a186f49c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"WindowsPath('A:/Arbeitsaufgaben/lang-models')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"constants.MODEL_BASE_FOLDER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "11ce4062-b229-4d88-967d-6eeb6d0135b7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sentence_transformers import SentenceTransformer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "40dac543-1e53-4fd8-a192-88f3527872b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_kwargs = {\n",
|
||||
" 'file_name': 'onnx/model_quint8_avx2.onnx',\n",
|
||||
" 'provider': 'CPUExecutionProvider',\n",
|
||||
" 'export': False,\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "8a0eaa9f-e2d2-4106-b80b-80916e9d8bfe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "215b46e3607e4530b2d8f8227367ef23",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"model_quint8_avx2.onnx: 0%| | 0.00/23.0M [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in A:\\Arbeitsaufgaben\\lang-models\\models--sentence-transformers--all-MiniLM-L6-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
|
||||
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
|
||||
" warnings.warn(message)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stfr = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine', backend='onnx', model_kwargs=model_kwargs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "cd921aca-0673-41ec-98a3-18e360a39a41",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from lang_main.constants import SPACY_MODEL_NAME"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "88173a68-7d8e-4f4c-a4ad-bbf78efaf781",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import importlib"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "e5293976-22ab-406a-ba32-066fd7254394",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mod = importlib.import_module(SPACY_MODEL_NAME)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6023a339-02da-429c-acf5-f14a56989357",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<module 'de_dep_news_trf' from 'A:\\\\Arbeitsaufgaben\\\\lang-main\\\\.venv\\\\Lib\\\\site-packages\\\\de_dep_news_trf\\\\__init__.py'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"mod"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "5f4fa066-fa0f-4818-9cf9-ec28923150ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded TOML config file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from lang_main.analysis.shared import clean_string_slim"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "71286836-7eb2-4095-ab82-42d7ac7ed476",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"string = 'Ölleckage durch\\nundichten \\t Ölsumpf,, aber Dichtung intakt??!!!'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e8284e76-e750-458e-bb63-d59d6d57a396",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ölleckage durch\n",
|
||||
"undichten \t Ölsumpf,, aber Dichtung intakt??!!!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(string)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "82e98d8f-2e24-42f9-a3ed-3b3454ae64f4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clean_string_slim(string)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b527b145-15d2-4961-b441-1843fe9f5c29",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "49c2e2f0-1e6d-4969-b583-8fc15b8930f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "16ae5d5c-a0a7-400b-8e38-231c72ad27b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "3b9fe636-f895-404a-819d-61198d34262d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Am war ich essen. Am hingegen nicht. Und war ich allein.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"string = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'\n",
|
||||
"string = pattern_dates.sub('', string)\n",
|
||||
"string"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c49ab3c-e860-42af-ac0c-2f44f075e846",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
|
||||
Reference in New Issue
Block a user