lang-main/notebooks/truncate_dataset.ipynb
2024-08-07 20:06:06 +02:00

825 lines
26 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "79034f9b-adae-4066-a35f-b0e7fd38055f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n",
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n",
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from ihm_analyse.lib.preprocess import load_raw_data\n",
"from ihm_analyse import load_pickle\n",
"from ihm_analyse.predefined_pipes import pipe_merge"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "af94968f-ae6c-402b-b866-cb6c15b81cef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pwd = os.getcwd()\n",
"pwd = Path(pwd)\n",
"p = pwd / '01_03_Rohdaten_202403/'\n",
"p"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "753daf9e-0209-4a13-b458-1048c8b2bfbf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export6 - 43306 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export8 - 708 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export9 - 8176 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_trunc.csv')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"folder = list(p.glob(r'*.csv'))\n",
"folder"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50d9ef9c-c56b-4d5b-9dfd-c02b68a29288",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "52186a59-69f2-4ed2-8d19-dac76e50526a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5b76284b-bcc3-4b31-9ece-bde35b22b717",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_59499_Zeilen.csv')"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path_to_dataset = folder[1]\n",
"path_to_dataset"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2701b8d9-657c-4d7a-b103-b8c8b1865224",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n",
"INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 59499, number of features 20\n"
]
}
],
"source": [
"(data,) = load_raw_data(path_to_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "74d7b72e-3cab-46e2-bca2-67c70b9221c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17849"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"percentage_trunc = 0.3\n",
"num_entries_trunc = int(len(data) * percentage_trunc)\n",
"num_entries_trunc"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "62d2fa0e-baa6-4d7c-bd37-5fdbe21005d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 17849 entries, 0 to 17848\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 VorgangsID 17849 non-null int64 \n",
" 1 ObjektID 17849 non-null int64 \n",
" 2 HObjektText 17848 non-null object \n",
" 3 ObjektArtID 17849 non-null int64 \n",
" 4 ObjektArtText 17849 non-null object \n",
" 5 VorgangsTypID 17849 non-null int64 \n",
" 6 VorgangsTypName 17849 non-null object \n",
" 7 VorgangsDatum 17849 non-null datetime64[ns]\n",
" 8 VorgangsStatusId 17849 non-null int64 \n",
" 9 VorgangsPrioritaet 17849 non-null int64 \n",
" 10 VorgangsBeschreibung 15988 non-null object \n",
" 11 VorgangsOrt 0 non-null float64 \n",
" 12 VorgangsArtText 17849 non-null object \n",
" 13 ErledigungsDatum 17849 non-null datetime64[ns]\n",
" 14 ErledigungsArtText 11879 non-null object \n",
" 15 ErledigungsBeschreibung 9916 non-null object \n",
" 16 MPMelderArbeitsplatz 3 non-null object \n",
" 17 MPAbteilungBezeichnung 3 non-null object \n",
" 18 Arbeitsbeginn 1920 non-null datetime64[ns]\n",
" 19 ErstellungsDatum 17849 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](4), float64(1), int64(6), object(9)\n",
"memory usage: 2.7+ MB\n"
]
}
],
"source": [
"data_trunc = data.iloc[:num_entries_trunc].copy()\n",
"data_trunc.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "6a15fb8c-e3b7-4c92-b73d-788b337d6251",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_trunc.csv')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"saving_path = p / 'Export7_trunc.csv'\n",
"saving_path"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "fb912634-cefa-4b8d-a370-37f6c8178f5a",
"metadata": {},
"outputs": [],
"source": [
"data_trunc.to_csv(\n",
" path_or_buf=saving_path,\n",
" sep=';', \n",
" encoding='cp1252', \n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30085691-aa23-478c-8d65-d3e6800c7c77",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "85733e19-6c52-479c-a8f0-3872bdbd5bfd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae65a019-26a8-45c9-bfb9-2662b84ff2f2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ba722ae-51b8-4e8d-9a1a-917098a3f70e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"id": "356f7d32-446e-4dc1-aa83-a0b816742087",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-3_remove_NA.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-5_analyse_feature.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-1_build_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Token_Analysis_Step-1_build_token_graph.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_path = pwd / 'results/Export7_trunc/'\n",
"contents = list(res_path.glob(r'*.pickle'))\n",
"contents"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "e4415e9c-6ebb-46d9-b06a-eb67df56689e",
"metadata": {},
"outputs": [],
"source": [
"preproc_data = contents[1]\n",
"last_step = contents[-1]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "2b29672a-c573-4d09-8601-e468a23bad0c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"ret_preproc_data = load_pickle(preproc_data)\n",
"ret_idx_paris = load_pickle(last_step)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "20c807da-e64f-4a48-8306-28a0a3dcfae9",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[53], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m idx_pairs \u001b[38;5;241m=\u001b[39m \u001b[43mret_idx_paris\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 2\u001b[0m preproc_data \u001b[38;5;241m=\u001b[39m ret_preproc_data[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[1;32m~\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\networkx\\classes\\graph.py:513\u001b[0m, in \u001b[0;36mGraph.__getitem__\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 489\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, n):\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a dict of neighbors of node n. Use: 'G[n]'.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 511\u001b[0m \u001b[38;5;124;03m AtlasView({1: {}})\u001b[39;00m\n\u001b[0;32m 512\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madj\u001b[49m\u001b[43m[\u001b[49m\u001b[43mn\u001b[49m\u001b[43m]\u001b[49m\n",
"File \u001b[1;32m~\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\networkx\\classes\\coreviews.py:81\u001b[0m, in \u001b[0;36mAdjacencyView.__getitem__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, name):\n\u001b[1;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m AtlasView(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_atlas\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m)\n",
"\u001b[1;31mKeyError\u001b[0m: 0"
]
}
],
"source": [
"idx_pairs = ret_idx_paris[0]\n",
"preproc_data = ret_preproc_data[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1fca8442-c0e9-420f-9ad4-22a3b672dda3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline >>Merge_Duplicates<<...\n",
"INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n",
"INFO:ihm_analyse.graphs:Graph properties: 5465 Nodes, 71087 Edges\n",
"INFO:ihm_analyse.graphs:Node memory: 149.43 KB\n",
"INFO:ihm_analyse.graphs:Edge memory: 3887.57 KB\n",
"INFO:ihm_analyse.graphs:Total memory: 4037.00 KB\n",
"INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under results\\Export7_trunc\\Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle\n",
"INFO:ihm_analyse.pipelines:Processing pipeline >>Merge_Duplicates<< successfully ended.\n"
]
}
],
"source": [
"ret = pipe_merge.run(starting_values=(preproc_data, idx_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "87bd8bba-b0c3-45a1-a9a8-b6bd279cf51f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>445</th>\n",
" <td>Wartung nach Arbeitsplan, siehe Extradaten / A...</td>\n",
" <td>52</td>\n",
" <td>3435</td>\n",
" <td>[563, 604, 616, 617, 15089, 15226, 15276, 1533...</td>\n",
" <td>36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>I/W nach Liste</td>\n",
" <td>14</td>\n",
" <td>238</td>\n",
" <td>[2363, 2364, 2367, 2368, 2369, 2370, 2371, 237...</td>\n",
" <td>85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2377</th>\n",
" <td>1 Wöchentliche Wartung aller 3 Etikettendrucke...</td>\n",
" <td>91</td>\n",
" <td>535</td>\n",
" <td>[111, 121, 127, 209, 219, 220, 221, 222, 236, ...</td>\n",
" <td>73</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2380</th>\n",
" <td>Infratech Meet Di + DO JourFix PT/InT</td>\n",
" <td>38</td>\n",
" <td>183</td>\n",
" <td>28526</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4337</th>\n",
" <td>24.05.2022 10:28:01 (Halm, Karl-Josef) Aktione...</td>\n",
" <td>579</td>\n",
" <td>3817</td>\n",
" <td>[5, 7, 9, 13, 14, 15, 17, 18, 24, 25, 30, 32, ...</td>\n",
" <td>754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3061</th>\n",
" <td>stopper schaltet nicht.</td>\n",
" <td>23</td>\n",
" <td>1</td>\n",
" <td>[15280]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3059</th>\n",
" <td>12.09.2022 13:48:24 (Struzyna, Christian) Temp...</td>\n",
" <td>127</td>\n",
" <td>1</td>\n",
" <td>[12671]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3054</th>\n",
" <td>08.09.2022 12:56:33 (Unruh, Jakob) Neue Serie ...</td>\n",
" <td>262</td>\n",
" <td>1</td>\n",
" <td>[273]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3053</th>\n",
" <td>Preset-Punkt überprüfen und ggf. nachjustieren...</td>\n",
" <td>148</td>\n",
" <td>1</td>\n",
" <td>[273]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3050</th>\n",
" <td>13.09.2022 08:05:40 (Betke, Gennadi) Griefer ...</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>[15785]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3627 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"445 Wartung nach Arbeitsplan, siehe Extradaten / A... 52 3435 \n",
"26 I/W nach Liste 14 238 \n",
"2377 1 Wöchentliche Wartung aller 3 Etikettendrucke... 91 535 \n",
"2380 Infratech Meet Di + DO JourFix PT/InT 38 183 \n",
"4337 24.05.2022 10:28:01 (Halm, Karl-Josef) Aktione... 579 3817 \n",
"... ... ... ... \n",
"3061 stopper schaltet nicht. 23 1 \n",
"3059 12.09.2022 13:48:24 (Struzyna, Christian) Temp... 127 1 \n",
"3054 08.09.2022 12:56:33 (Unruh, Jakob) Neue Serie ... 262 1 \n",
"3053 Preset-Punkt überprüfen und ggf. nachjustieren... 148 1 \n",
"3050 13.09.2022 08:05:40 (Betke, Gennadi) Griefer ... 79 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"445 [563, 604, 616, 617, 15089, 15226, 15276, 1533... 36 \n",
"26 [2363, 2364, 2367, 2368, 2369, 2370, 2371, 237... 85 \n",
"2377 [111, 121, 127, 209, 219, 220, 221, 222, 236, ... 73 \n",
"2380 28526 1 \n",
"4337 [5, 7, 9, 13, 14, 15, 17, 18, 24, 25, 30, 32, ... 754 \n",
"... ... ... \n",
"3061 [15280] 1 \n",
"3059 [12671] 1 \n",
"3054 [273] 1 \n",
"3053 [273] 1 \n",
"3050 [15785] 1 \n",
"\n",
"[3627 rows x 5 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43e6d41c-7a49-4756-9629-0ec0ee6c5b7c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1a9969fa-6b0d-466a-bd4f-1ba5f4868873",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" int('23456')\n",
"except ValueError:\n",
" print('went wrong')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76af32de-5f0a-4d7e-9751-5f2a38a7a69e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bd5cffa-0b09-45c7-bc15-0cd3082353d7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e8423609-c95d-42c8-99f3-95274fa52ae8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-3_remove_NA.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-5_analyse_feature.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-1_build_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Token_Analysis_Step-1_build_token_graph.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_path = pwd / 'results/Export7_trunc/'\n",
"contents = list(res_path.glob(r'*.pickle'))\n",
"contents"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "71fa1c2e-22cf-483a-964c-a5cca2bd3790",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path_to_graph = contents[-1]\n",
"path_to_graph"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9101beaf-6a7c-4987-9c44-141386966291",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"tk_graph = load_pickle(path_to_graph)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e08954c3-9a5f-43c8-a98e-f9b8a74c3ff5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 10536, number of edges: 48562)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2a8d0abb-d68b-4c6e-80e9-b3b27998c8d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 10536,\n",
" 'num_edges': 46393,\n",
" 'min_edge_weight': 1,\n",
" 'max_edge_weight': 15374,\n",
" 'node_memory': 652596,\n",
" 'edge_memory': 2598008,\n",
" 'total_memory': 3250604}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph.metadata_undirected"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "cc34c667-5a33-4061-a83a-50fc8c537b19",
"metadata": {},
"outputs": [],
"source": [
"tk_graph_filtered = tk_graph.filter_by_edge_weight(100)\n",
"tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "67df971c-fe7a-4f88-89ae-ba1366da1166",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 289,\n",
" 'num_edges': 457,\n",
" 'min_edge_weight': 100,\n",
" 'max_edge_weight': 15369,\n",
" 'node_memory': 17674,\n",
" 'edge_memory': 25592,\n",
" 'total_memory': 43266}"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph_filtered.metadata_undirected"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "8c524312-aff4-47f4-801e-ad8112aa2a70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc')"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"save_path_graph = res_path\n",
"save_path_graph"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "b62c888f-620d-4b29-924b-45ea17d99bc1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.graphs:Successfully saved graph as GraphML file under A:\\Arbeitsaufgaben\\Instandhaltung\\results\\Export7_trunc\\TokenGraph-filtered.graphml.\n"
]
}
],
"source": [
"tk_graph_filtered.save_graph(save_path_graph, filename='TokenGraph-filtered')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca21e2d3-dc5a-4117-8be9-d132ba2c8d28",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}