{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "147e39b9-0066-4cca-9561-8ed0c994850c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f095f9ff-f7c0-4446-97cb-c208a1ae62c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'A:\\\\Arbeitsaufgaben\\\\Instandhaltung'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"str_path = os.getcwd()\n",
"str_path"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e9610b62-667c-4322-b936-bee6d45c17cf",
"metadata": {},
"outputs": [],
"source": [
"p = Path(str_path)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "b1258614-d8e9-4205-992d-b16a5406f049",
"metadata": {},
"outputs": [],
"source": [
"folder = list((p / 'results' / 'test_new2').glob('*'))"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "a4cec1df-cc16-481b-9e3c-f12747283bd8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step_3_CosSim-FilterCandidates.xlsx'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl')]"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"folder"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "510b2262-edab-4874-878d-f736a6076e79",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f0841940-2285-4bc6-bc08-8a04844d7fd3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step_3_CosSim-FilterCandidates.xlsx'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/test_new2/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"import networkx as nx\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from ihm_analyse import load_pickle\n",
"from ihm_analyse.lib.preprocess import merge_similarity_dupl\n",
"from ihm_analyse.lib.graphs import update_graph, get_graph_metadata\n",
"\n",
"\n",
"str_path = os.getcwd()\n",
"p = Path(str_path)\n",
"folder = list((p / 'results' / 'test_new2').glob('*'))\n",
"folder"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8e51a545-228a-4f51-8440-53db05551d69",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# dataset\n",
"res = load_pickle(folder[1])\n",
"data = res[0]\n",
"# dupl IDs\n",
"res = load_pickle(folder[-2])\n",
"dupl_ids = res[0]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b95631d0-018a-4a0d-9d94-dec5db33dff4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9331"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dupl_ids)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "e1e2a149-a1d4-47c7-a9fa-a96db09f7144",
"metadata": {},
"outputs": [],
"source": [
"sub_ids = dupl_ids.copy()\n",
"sub_ids = dupl_ids[:20]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "35097dd4-e19a-4478-abe2-74135fec9fdc",
"metadata": {},
"outputs": [],
"source": [
"# build index graph to obtain graph of connected (similar) indices\n",
"# use this graph to obtain connected components (indices which belong together)\n",
"# retain semantic connection on whole dataset\n",
"dupl_id_graph = nx.Graph()\n",
"\n",
"for (idx1, idx2) in sub_ids:\n",
" # inplace operation, parent/child do not really exist in undirected graph\n",
" update_graph(graph=dupl_id_graph, parent=idx1, child=idx2)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "0a6c6e61-91da-4f67-a3a5-b20072a8c1f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.graphs:Graph properties: 24 Nodes, 20 Edges\n",
"INFO:ihm_analyse.graphs:Node memory: 0.66 KB\n",
"INFO:ihm_analyse.graphs:Edge memory: 1.09 KB\n",
"INFO:ihm_analyse.graphs:Total memory: 1.75 KB\n"
]
}
],
"source": [
"graph_meta = get_graph_metadata(graph=dupl_id_graph)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "92e9ba06-428b-412d-90e8-1f161c93b681",
"metadata": {},
"outputs": [],
"source": [
"conn_ids = nx.connected_components(dupl_id_graph)\n",
"conn_ids_tpl = tuple(conn_ids)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "379f983e-776e-4520-9753-61adbeac968c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{33, 487, 5703, 176, 247, 345, 157}\n",
"{882, 131}\n",
"{561, 332, 558}\n",
"{104, 4003}\n",
"{5298, 132}\n",
"{34, 3121, 3122, 3123, 63}\n",
"{168, 6378, 1068}\n"
]
}
],
"source": [
"for id_set in conn_ids:\n",
" print(id_set)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "5edec07e-9618-4650-b806-e49de3301262",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{33, 157, 176, 247, 345, 487, 5703},\n",
" {131, 882},\n",
" {332, 558, 561},\n",
" {104, 4003},\n",
" {132, 5298},\n",
" {34, 63, 3121, 3122, 3123},\n",
" {168, 1068, 6378}]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"conn_ids_lst"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "f37dfc52-2734-45ff-8939-03eb47465d41",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 162 | \n",
" Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n",
" 66 | \n",
" 92592 | \n",
" [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n",
" 206 | \n",
"
\n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 1654 | \n",
" [301, 304, 305, 313, 314, 331, 332, 510, 511, ... | \n",
" 18 | \n",
"
\n",
" \n",
" | 131 | \n",
" Tägliche Überprüfung der Ölabscheider | \n",
" 37 | \n",
" 1616 | \n",
" [0, 970, 2134, 2137] | \n",
" 4 | \n",
"
\n",
" \n",
" | 160 | \n",
" Wöchentliche Kontrolle der WC-Anlagen | \n",
" 37 | \n",
" 1265 | \n",
" [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n",
" 11 | \n",
"
\n",
" \n",
" | 140 | \n",
" Halbjährliche Kontrolle des Stabbreithalters | \n",
" 44 | \n",
" 687 | \n",
" [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n",
" 166 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2679 | \n",
" Zahnräder der Laufkatze verschlissen Ersatztei... | \n",
" 170 | \n",
" 1 | \n",
" [415] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2678 | \n",
" Bitte 8 Scheiben nach Muster anfertigen. Danke. | \n",
" 48 | \n",
" 1 | \n",
" [140] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2677 | \n",
" Schalter für Bühne Schwenken abgerissen, bitte... | \n",
" 126 | \n",
" 1 | \n",
" [323] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2676 | \n",
" Docke angefahren! | \n",
" 17 | \n",
" 1 | \n",
" [176] | \n",
" 1 | \n",
"
\n",
" \n",
" | 6799 | \n",
" Befestigung Deckel für Batteriefach defekt ... | \n",
" 107 | \n",
" 1 | \n",
" [326] | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
6800 rows × 5 columns
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "5da4d722-fae2-4835-821c-70ed348bb71a",
"metadata": {},
"outputs": [],
"source": [
"test_ids = list(conn_ids_tpl[0])"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "9eef53d4-2b0f-40cb-b4c3-6a2992d7ec09",
"metadata": {},
"outputs": [],
"source": [
"sub_data = data.loc[test_ids,:].copy()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "42aa3a0a-85ff-40a1-8473-6b927ade9fe6",
"metadata": {},
"outputs": [],
"source": [
"# obtain bunch\n",
"# filter for bunch\n",
"# merge bunch\n",
"# remove all but merged entry from whole dataset"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "684eeaf7-f86b-4159-bca9-a345601a2d2b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 1654 | \n",
" [301, 304, 305, 313, 314, 331, 332, 510, 511, ... | \n",
" 18 | \n",
"
\n",
" \n",
" | 157 | \n",
" Monatliche Sichtkontrolle | \n",
" 25 | \n",
" 634 | \n",
" [1038, 1040, 1041, 1042, 1043, 1044, 1045, 121... | \n",
" 24 | \n",
"
\n",
" \n",
" | 176 | \n",
" Wöchentliche Sichtprüfung / Reinigung | \n",
" 37 | \n",
" 361 | \n",
" [301, 304, 305, 313, 314, 323, 329, 421, 1003,... | \n",
" 11 | \n",
"
\n",
" \n",
" | 247 | \n",
" Monatliche Sichtkontrolle / Reinigung | \n",
" 37 | \n",
" 113 | \n",
" [899, 906, 1052, 1169, 1170, 1725] | \n",
" 6 | \n",
"
\n",
" \n",
" | 487 | \n",
" Wöchentliche Sichtprüfung | \n",
" 25 | \n",
" 35 | \n",
" [1666] | \n",
" 1 | \n",
"
\n",
" \n",
" | 345 | \n",
" Monatliche Sichtprüfung / Reinigung | \n",
" 35 | \n",
" 33 | \n",
" [885, 899, 906, 945, 946, 970, 1052, 1169, 1170] | \n",
" 9 | \n",
"
\n",
" \n",
" | 5703 | \n",
" monatliche Sichtkontrolle | \n",
" 25 | \n",
" 1 | \n",
" [1725] | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"157 Monatliche Sichtkontrolle 25 634 \n",
"176 Wöchentliche Sichtprüfung / Reinigung 37 361 \n",
"247 Monatliche Sichtkontrolle / Reinigung 37 113 \n",
"487 Wöchentliche Sichtprüfung 25 35 \n",
"345 Monatliche Sichtprüfung / Reinigung 35 33 \n",
"5703 monatliche Sichtkontrolle 25 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"157 [1038, 1040, 1041, 1042, 1043, 1044, 1045, 121... 24 \n",
"176 [301, 304, 305, 313, 314, 323, 329, 421, 1003,... 11 \n",
"247 [899, 906, 1052, 1169, 1170, 1725] 6 \n",
"487 [1666] 1 \n",
"345 [885, 899, 906, 945, 946, 970, 1052, 1169, 1170] 9 \n",
"5703 [1725] 1 "
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub_data = sub_data.sort_values(by=['num_occur', 'num_assoc_obj_ids', 'len'], ascending=[False, False, False])\n",
"sub_data"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "5e048cc8-ca58-48cc-8b6f-57a0fbe88485",
"metadata": {},
"outputs": [],
"source": [
"# keep first entry with max number occurrences, then number of oassociated objects,\n",
"# then length of entry\n",
"data_idx = sub_data.index[0]\n",
"#entry = sub_data.iat[0,0]\n",
"#sub_data.at[data_idx, 'len'] = len(entry)\n",
"sub_data.at[data_idx, 'num_occur'] = sub_data['num_occur'].sum()\n",
"# assoc IDs\n",
"assoc_obj_ids = sub_data['assoc_obj_ids'].to_numpy()\n",
"assoc_obj_ids = np.concatenate(assoc_obj_ids)\n",
"assoc_obj_ids = np.unique(assoc_obj_ids)\n",
"sub_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids\n",
"sub_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_ids_uni)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "0fac40c5-102b-40d8-a45b-1c0bd89bb672",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 2831 | \n",
" [301, 304, 305, 313, 314, 323, 329, 331, 332, ... | \n",
" 54 | \n",
"
\n",
" \n",
" | 157 | \n",
" Monatliche Sichtkontrolle | \n",
" 25 | \n",
" 634 | \n",
" [1038, 1040, 1041, 1042, 1043, 1044, 1045, 121... | \n",
" 24 | \n",
"
\n",
" \n",
" | 176 | \n",
" Wöchentliche Sichtprüfung / Reinigung | \n",
" 37 | \n",
" 361 | \n",
" [301, 304, 305, 313, 314, 323, 329, 421, 1003,... | \n",
" 11 | \n",
"
\n",
" \n",
" | 247 | \n",
" Monatliche Sichtkontrolle / Reinigung | \n",
" 37 | \n",
" 113 | \n",
" [899, 906, 1052, 1169, 1170, 1725] | \n",
" 6 | \n",
"
\n",
" \n",
" | 487 | \n",
" Wöchentliche Sichtprüfung | \n",
" 25 | \n",
" 35 | \n",
" [1666] | \n",
" 1 | \n",
"
\n",
" \n",
" | 345 | \n",
" Monatliche Sichtprüfung / Reinigung | \n",
" 35 | \n",
" 33 | \n",
" [885, 899, 906, 945, 946, 970, 1052, 1169, 1170] | \n",
" 9 | \n",
"
\n",
" \n",
" | 5703 | \n",
" monatliche Sichtkontrolle | \n",
" 25 | \n",
" 1 | \n",
" [1725] | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2831 \n",
"157 Monatliche Sichtkontrolle 25 634 \n",
"176 Wöchentliche Sichtprüfung / Reinigung 37 361 \n",
"247 Monatliche Sichtkontrolle / Reinigung 37 113 \n",
"487 Wöchentliche Sichtprüfung 25 35 \n",
"345 Monatliche Sichtprüfung / Reinigung 35 33 \n",
"5703 monatliche Sichtkontrolle 25 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 54 \n",
"157 [1038, 1040, 1041, 1042, 1043, 1044, 1045, 121... 24 \n",
"176 [301, 304, 305, 313, 314, 323, 329, 421, 1003,... 11 \n",
"247 [899, 906, 1052, 1169, 1170, 1725] 6 \n",
"487 [1666] 1 \n",
"345 [885, 899, 906, 945, 946, 970, 1052, 1169, 1170] 9 \n",
"5703 [1725] 1 "
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub_data"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "de617dbf-0c96-4702-aa5f-c72af2b004e8",
"metadata": {},
"outputs": [],
"source": [
"test_ids.remove(data_idx)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "1279561e-c46d-48a6-a679-dcbcb7c72761",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[487, 5703, 176, 247, 345, 157]"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_ids"
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "acfffecf-2576-4979-8b19-6bd31d0e0d64",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 2831 | \n",
" [301, 304, 305, 313, 314, 323, 329, 331, 332, ... | \n",
" 54 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2831 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 54 "
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub_data2 = sub_data.drop(index=test_ids)\n",
"sub_data2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb240df2-044c-44b9-8d4e-c5fd0d157c07",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab927a8c-fed3-42f2-a15d-9403184b1f8c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(33, 176), (33, 247), (33, 487), (131, 882), (332, 558)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_ids = dupl_ids[:5]\n",
"test_ids"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "adc463b7-9ea2-48e1-84e4-972ef45b5f9b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.graphs:Graph properties: 2695 Nodes, 9331 Edges\n",
"INFO:ihm_analyse.graphs:Node memory: 73.69 KB\n",
"INFO:ihm_analyse.graphs:Edge memory: 510.29 KB\n",
"INFO:ihm_analyse.graphs:Total memory: 583.98 KB\n"
]
}
],
"source": [
"ret = merge_similarity_dupl_test(data=data, dupl_idx_pairs=dupl_ids)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f2d91678-ea68-49e7-91c1-1cbc8a4fe0cc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 162 | \n",
" Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n",
" 66 | \n",
" 92592 | \n",
" [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n",
" 206 | \n",
"
\n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 3111 | \n",
" [301, 304, 305, 313, 314, 323, 329, 331, 332, ... | \n",
" 74 | \n",
"
\n",
" \n",
" | 131 | \n",
" Tägliche Überprüfung der Ölabscheider | \n",
" 37 | \n",
" 1619 | \n",
" [0, 970, 2134, 2137] | \n",
" 4 | \n",
"
\n",
" \n",
" | 160 | \n",
" Wöchentliche Kontrolle der WC-Anlagen | \n",
" 37 | \n",
" 1265 | \n",
" [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n",
" 11 | \n",
"
\n",
" \n",
" | 140 | \n",
" Halbjährliche Kontrolle des Stabbreithalters | \n",
" 44 | \n",
" 687 | \n",
" [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n",
" 166 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2680 | \n",
" Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... | \n",
" 260 | \n",
" 1 | \n",
" [311] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2679 | \n",
" Zahnräder der Laufkatze verschlissen Ersatztei... | \n",
" 170 | \n",
" 1 | \n",
" [415] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2677 | \n",
" Schalter für Bühne Schwenken abgerissen, bitte... | \n",
" 126 | \n",
" 1 | \n",
" [323] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2676 | \n",
" Docke angefahren! | \n",
" 17 | \n",
" 1 | \n",
" [176] | \n",
" 1 | \n",
"
\n",
" \n",
" | 6799 | \n",
" Befestigung Deckel für Batteriefach defekt ... | \n",
" 107 | \n",
" 2 | \n",
" [306, 326] | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
4582 rows × 5 columns
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 3111 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 2 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 74 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2680 [311] 1 \n",
"2679 [415] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [306, 326] 2 \n",
"\n",
"[4582 rows x 5 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cb13e547-5107-4f7b-a92d-ea52e7ce2fd4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 162 | \n",
" Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n",
" 66 | \n",
" 92592 | \n",
" [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n",
" 206 | \n",
"
\n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 1654 | \n",
" [301, 304, 305, 313, 314, 331, 332, 510, 511, ... | \n",
" 18 | \n",
"
\n",
" \n",
" | 131 | \n",
" Tägliche Überprüfung der Ölabscheider | \n",
" 37 | \n",
" 1616 | \n",
" [0, 970, 2134, 2137] | \n",
" 4 | \n",
"
\n",
" \n",
" | 160 | \n",
" Wöchentliche Kontrolle der WC-Anlagen | \n",
" 37 | \n",
" 1265 | \n",
" [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n",
" 11 | \n",
"
\n",
" \n",
" | 140 | \n",
" Halbjährliche Kontrolle des Stabbreithalters | \n",
" 44 | \n",
" 687 | \n",
" [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n",
" 166 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2679 | \n",
" Zahnräder der Laufkatze verschlissen Ersatztei... | \n",
" 170 | \n",
" 1 | \n",
" [415] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2678 | \n",
" Bitte 8 Scheiben nach Muster anfertigen. Danke. | \n",
" 48 | \n",
" 1 | \n",
" [140] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2677 | \n",
" Schalter für Bühne Schwenken abgerissen, bitte... | \n",
" 126 | \n",
" 1 | \n",
" [323] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2676 | \n",
" Docke angefahren! | \n",
" 17 | \n",
" 1 | \n",
" [176] | \n",
" 1 | \n",
"
\n",
" \n",
" | 6799 | \n",
" Befestigung Deckel für Batteriefach defekt ... | \n",
" 107 | \n",
" 1 | \n",
" [326] | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
6800 rows × 5 columns
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "markdown",
"id": "e5f16846-f92e-4a85-8cba-830e34705837",
"metadata": {},
"source": [
"## New Merge Duplicates in Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ed62a563-886f-4269-ab27-237ff39ea0da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# dataset\n",
"res = load_pickle(folder[-1])\n",
"data = res[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e82810d-8cda-439d-ae26-4e65bad351d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entry | \n",
" len | \n",
" num_occur | \n",
" assoc_obj_ids | \n",
" num_assoc_obj_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 162 | \n",
" Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n",
" 66 | \n",
" 92592 | \n",
" [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n",
" 206 | \n",
"
\n",
" \n",
" | 33 | \n",
" Wöchentliche Sichtkontrolle / Reinigung | \n",
" 39 | \n",
" 3111 | \n",
" [301, 304, 305, 313, 314, 323, 329, 331, 332, ... | \n",
" 74 | \n",
"
\n",
" \n",
" | 131 | \n",
" Tägliche Überprüfung der Ölabscheider | \n",
" 37 | \n",
" 1619 | \n",
" [0, 970, 2134, 2137] | \n",
" 4 | \n",
"
\n",
" \n",
" | 160 | \n",
" Wöchentliche Kontrolle der WC-Anlagen | \n",
" 37 | \n",
" 1265 | \n",
" [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n",
" 11 | \n",
"
\n",
" \n",
" | 140 | \n",
" Halbjährliche Kontrolle des Stabbreithalters | \n",
" 44 | \n",
" 687 | \n",
" [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n",
" 166 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 2680 | \n",
" Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... | \n",
" 260 | \n",
" 1 | \n",
" [311] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2679 | \n",
" Zahnräder der Laufkatze verschlissen Ersatztei... | \n",
" 170 | \n",
" 1 | \n",
" [415] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2677 | \n",
" Schalter für Bühne Schwenken abgerissen, bitte... | \n",
" 126 | \n",
" 1 | \n",
" [323] | \n",
" 1 | \n",
"
\n",
" \n",
" | 2676 | \n",
" Docke angefahren! | \n",
" 17 | \n",
" 1 | \n",
" [176] | \n",
" 1 | \n",
"
\n",
" \n",
" | 6799 | \n",
" Befestigung Deckel für Batteriefach defekt ... | \n",
" 107 | \n",
" 2 | \n",
" [306, 326] | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
4582 rows × 5 columns
\n",
"
"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 3111 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 2 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 74 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2680 [311] 1 \n",
"2679 [415] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [306, 326] 2 \n",
"\n",
"[4582 rows x 5 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "154111fe-24cc-47a1-9de2-56e1dcf36f67",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}