1090 lines
37 KiB
Plaintext
1090 lines
37 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "c75d375c-3fe4-4bf4-a987-f2ceb5f98072",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"%load_ext autoreload\n",
|
||
"%autoreload 2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from ihm_analyse import CONFIG\n",
|
||
"from ihm_analyse.lib.preprocess import (\n",
|
||
" load_raw_data,\n",
|
||
" remove_duplicates,\n",
|
||
" remove_NA,\n",
|
||
" clean_string_slim,\n",
|
||
" entry_wise_cleansing,\n",
|
||
" analyse_feature,\n",
|
||
" build_cosSim_matrix,\n",
|
||
" filt_thresh_cosSim_matrix,\n",
|
||
" list_cosSim_dupl_candidates,\n",
|
||
" merge_similarity_dupl,\n",
|
||
")\n",
|
||
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||
"from ihm_analyse.lib.helpers import (\n",
|
||
" save_pickle, \n",
|
||
" load_pickle, \n",
|
||
" create_saving_folder,\n",
|
||
" load_toml_config,\n",
|
||
")\n",
|
||
"\n",
|
||
"from sentence_transformers import SentenceTransformer\n",
|
||
"import spacy\n",
|
||
"from pathlib import Path"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "b162b112-a6f6-42a9-9929-19fc32ba181c",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Preprocessing\n",
|
||
"\n",
|
||
"## Whole Dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "c797bcf3-a982-4717-b654-ae8802420a99",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Dataset selection plus the input/output locations derived from it\n",
"DATA_SET_ID = 'Export4'\n",
"FILE_PATH = f'./01_2_Rohdaten_neu/{DATA_SET_ID}.csv'\n",
"SAVE_PATH_FOLDER = f'./results/{DATA_SET_ID}/'\n",
"\n",
"# column names handed to load_raw_data as its 'date_cols' argument\n",
"date_cols = ['VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn', 'ErstellungsDatum']\n",
"\n",
"path_raw_data = Path(FILE_PATH)\n",
"create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "994ce600-198c-4e2c-977b-b4af1b77766b",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Preprocessing pipeline: load -> drop duplicates -> drop NA -> clean text -> analyse feature\n",
"pipe = BasePipeline(name='Preprocess1', working_dir=SAVE_PATH_FOLDER)\n",
"pipe.add(load_raw_data, {'date_cols': date_cols})\n",
"pipe.add(remove_duplicates)\n",
"# save_result=True: the result of this step is written to the working dir\n",
"pipe.add(remove_NA, save_result=True)\n",
"pipe.add(\n",
"    entry_wise_cleansing,\n",
"    {\n",
"        'target_feature': 'VorgangsBeschreibung',\n",
"        'cleansing_func': clean_string_slim,\n",
"    },\n",
")\n",
"pipe.add(\n",
"    analyse_feature,\n",
"    {'target_feature': 'VorgangsBeschreibung'},\n",
"    save_result=True,\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "e83b6de7-d2a1-4b45-ae16-50347ca677ef",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pipe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "45f67843-e44d-4168-843c-762d84f66284",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
|
||
"INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n",
|
||
"INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20\n",
|
||
"INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84\n",
|
||
"INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.\n",
|
||
"INFO:ihm_analyse.preprocess:New Dataset properties: number of entries: 128936, number of features 20\n",
|
||
"INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.\n",
|
||
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl\n",
|
||
"INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<\n",
|
||
"INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"ret = pipe.run(starting_values=(path_raw_data,))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "733539d8-48e7-40bd-b412-65037514f064",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>entry</th>\n",
|
||
" <th>len</th>\n",
|
||
" <th>num_occur</th>\n",
|
||
" <th>assoc_obj_ids</th>\n",
|
||
" <th>num_assoc_obj_ids</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>162</th>\n",
|
||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>92592</td>\n",
|
||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||
" <td>206</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>1654</td>\n",
|
||
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
|
||
" <td>18</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>131</th>\n",
|
||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1616</td>\n",
|
||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1265</td>\n",
|
||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||
" <td>44</td>\n",
|
||
" <td>687</td>\n",
|
||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||
" <td>166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2679</th>\n",
|
||
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[415]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2678</th>\n",
|
||
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
|
||
" <td>48</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[140]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2677</th>\n",
|
||
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
|
||
" <td>126</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[323]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2676</th>\n",
|
||
" <td>Docke angefahren!</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[176]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6799</th>\n",
|
||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||
" <td>107</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[326]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6800 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" entry len num_occur \\\n",
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
|
||
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
|
||
"... ... ... ... \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
|
||
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
|
||
"2676 Docke angefahren! 17 1 \n",
|
||
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
|
||
"\n",
|
||
" assoc_obj_ids num_assoc_obj_ids \n",
|
||
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
|
||
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
|
||
"131 [0, 970, 2134, 2137] 4 \n",
|
||
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
|
||
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
|
||
"... ... ... \n",
|
||
"2679 [415] 1 \n",
|
||
"2678 [140] 1 \n",
|
||
"2677 [323] 1 \n",
|
||
"2676 [176] 1 \n",
|
||
"6799 [326] 1 \n",
|
||
"\n",
|
||
"[6800 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ret[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 171,
|
||
"id": "a4e2d4c3-8832-45cd-b90b-6065714f6bce",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
|
||
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Reload the pickled step results written by the 'Preprocess1' pipeline\n",
"ret = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER,\n",
"    filename='Pipe-Preprocess1_Step-3_remove_NA',\n",
")\n",
"pre_1 = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER,\n",
"    filename='Pipe-Preprocess1_Step-5_analyse_feature',\n",
")\n",
"# the first element of the loaded result is the per-entry analysis table\n",
"preprocessed_data = pre_1[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 172,
|
||
"id": "23590b29-e31e-4bfd-bc8c-d9c23dd2e363",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>entry</th>\n",
|
||
" <th>len</th>\n",
|
||
" <th>num_occur</th>\n",
|
||
" <th>assoc_obj_ids</th>\n",
|
||
" <th>num_assoc_obj_ids</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>162</th>\n",
|
||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>92592</td>\n",
|
||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||
" <td>206</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>1654</td>\n",
|
||
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
|
||
" <td>18</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>131</th>\n",
|
||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1616</td>\n",
|
||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1265</td>\n",
|
||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||
" <td>44</td>\n",
|
||
" <td>687</td>\n",
|
||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||
" <td>166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2679</th>\n",
|
||
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[415]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2678</th>\n",
|
||
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
|
||
" <td>48</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[140]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2677</th>\n",
|
||
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
|
||
" <td>126</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[323]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2676</th>\n",
|
||
" <td>Docke angefahren!</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[176]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6799</th>\n",
|
||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||
" <td>107</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[326]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6800 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" entry len num_occur \\\n",
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
|
||
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
|
||
"... ... ... ... \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
|
||
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
|
||
"2676 Docke angefahren! 17 1 \n",
|
||
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
|
||
"\n",
|
||
" assoc_obj_ids num_assoc_obj_ids \n",
|
||
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
|
||
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
|
||
"131 [0, 970, 2134, 2137] 4 \n",
|
||
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
|
||
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
|
||
"... ... ... \n",
|
||
"2679 [415] 1 \n",
|
||
"2678 [140] 1 \n",
|
||
"2677 [323] 1 \n",
|
||
"2676 [176] 1 \n",
|
||
"6799 [326] 1 \n",
|
||
"\n",
|
||
"[6800 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 172,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"preprocessed_data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "09fd738a-ee8e-447e-8c3c-7de8566cca32",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Embeddings"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"id": "946e29e0-6253-4c30-9c96-03d3b721abca",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n",
|
||
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Settings for the similarity-based duplicate search\n",
"SIMILARITY_THRESHOLD = CONFIG['preprocess']['cosine_similarity_threshold']\n",
"FILENAME_COSSIM_CANDFILT_WHOLE = 'CosSim-FiltCand'\n",
"\n",
"# NOTE(review): nlp is not referenced by any other cell visible in this notebook - confirm it is still needed\n",
"nlp = spacy.load('de_dep_news_trf')\n",
"# NOTE(review): all-mpnet-base-v2 is published as an English model while the descriptions are German;\n",
"# consider evaluating a multilingual checkpoint - TODO confirm\n",
"model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n",
"\n",
"pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"id": "c7c3375a-cc8b-4100-84e7-9608074aab2d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Steps: similarity matrix -> threshold filter -> list duplicate candidates\n",
"# NOTE(review): re-running this cell presumably appends the steps a second time;\n",
"# re-create pipe_embds first - TODO confirm against BasePipeline.add\n",
"pipe_embds.add(\n",
"    build_cosSim_matrix,\n",
"    {'model': model_stfr},\n",
"    save_result=True,\n",
")\n",
"pipe_embds.add(\n",
"    filt_thresh_cosSim_matrix,\n",
"    {'threshold': SIMILARITY_THRESHOLD},\n",
"    save_result=True,\n",
")\n",
"pipe_embds.add(\n",
"    list_cosSim_dupl_candidates,\n",
"    {\n",
"        'save_candidates': True,\n",
"        'saving_path': SAVE_PATH_FOLDER,\n",
"        'filename': FILENAME_COSSIM_CANDFILT_WHOLE,\n",
"        'pipeline': pipe_embds,\n",
"    },\n",
"    save_result=True,\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"id": "ff1ac9cd-66fd-4ec8-bca2-f48b13e5c943",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ...\n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung\n",
|
||
"131 Tägliche Überprüfung der Ölabscheider\n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen\n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters\n",
|
||
" ... \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei...\n",
|
||
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke.\n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte...\n",
|
||
"2676 Docke angefahren!\n",
|
||
"6799 Befestigung Deckel für Batteriefach defekt ...\n",
|
||
"Name: entry, Length: 6787, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 58,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
"# drop descriptions whose 'len' value is 5 or less\n",
"subset_data = preprocessed_data.loc[preprocessed_data['len'].gt(5), 'entry'].copy()\n",
"\n",
"# independent copy that is handed to the embedding pipeline as its starting value\n",
"start_val = subset_data.copy()\n",
"start_val"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 59,
|
||
"id": "bd94dbbe-cfd3-45f2-8b8e-127103fce2f1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
|
||
"INFO:ihm_analyse.preprocess:Start building embedding map...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|██████████████████████████████████████████████████████████████████████████████| 6787/6787 [06:08<00:00, 18.43it/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.preprocess:Embedding map built successfully.\n",
|
||
"INFO:ihm_analyse.preprocess:Start calculation of similarity scores...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"100%|███████████████████████████████████████████████████████████████████| 23028291/23028291 [18:00<00:00, 21305.85it/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.preprocess:Similarity scores calculated successfully.\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl\n",
|
||
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl\n",
|
||
"INFO:ihm_analyse.preprocess:Start gathering of similarity candidates...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|████████████████████████████████████████████████████████████████████████████| 9331/9331 [00:03<00:00, 2737.75it/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.preprocess:Similarity candidates gathered successfully.\n",
|
||
"INFO:ihm_analyse.preprocess:Saving similarity candidates...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.preprocess:Similarity candidates saved successfully to >>./results/Export4/PipeStep_3_CosSim-FiltCand.xlsx<<.\n",
|
||
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"dupl_idx_pairs, embds = pipe_embds.run(starting_values=(start_val,))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"id": "dbf4b2be-2486-4859-a6e8-541fadfe6e6f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
|
||
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-1_build_cosSim_matrix')\n",
|
||
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 175,
|
||
"id": "91edbe35-1694-4f22-a777-1a1b6577a807",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"9331"
|
||
]
|
||
},
|
||
"execution_count": 175,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(dupl_idx_pairs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 166,
|
||
"id": "47cee17a-9beb-49c0-97d1-d6380942cf16",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'./results/Export4/dupl_idx_pairs.pkl'"
|
||
]
|
||
},
|
||
"execution_count": 166,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
"# pickle file for the duplicate candidate index pairs\n",
"path_to_idx_pairs = f'{SAVE_PATH_FOLDER}dupl_idx_pairs.pkl'\n",
"path_to_idx_pairs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 174,
|
||
"id": "284bab2e-e704-4f75-9599-0d80e7f98894",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Write the candidate pairs once; the save is skipped when the pickle already exists,\n",
"# so a kernel that did not run the embedding pipeline can still reload them from disk.\n",
"if not Path(path_to_idx_pairs).exists():\n",
"    save_pickle(obj=dupl_idx_pairs, path=path_to_idx_pairs)\n",
"dupl_idx_pairs = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='dupl_idx_pairs')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "88de908f-9da7-4baf-9f34-61b0cd435c6b",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Merge Duplicate Candidates"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 176,
|
||
"id": "1e7a759b-7899-44e1-9673-7aed87db57fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"prep_data = preprocessed_data.copy()\n",
|
||
"\n",
|
||
"pipe_3 = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 177,
|
||
"id": "2cde0c91-9847-4230-a0f3-89e292810e07",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pipe_3.add(merge_similarity_dupl, save_result=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 178,
|
||
"id": "cba685ba-8e8c-4890-8df6-652753346e5d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
|
||
"INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"100%|███████████████████████████████████████████████████████████████████████████| 9331/9331 [00:00<00:00, 10511.31it/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n",
|
||
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# merge duplicate candidates\n",
|
||
"# info needed: preprocessed data + idx pairs of duplicate candidates\n",
|
||
"ret = pipe_3.run(starting_values=(prep_data, dupl_idx_pairs))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"id": "9f8b13ca-eeaa-46a0-b791-10d227d25db8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>entry</th>\n",
|
||
" <th>len</th>\n",
|
||
" <th>num_occur</th>\n",
|
||
" <th>assoc_obj_ids</th>\n",
|
||
" <th>num_assoc_obj_ids</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>162</th>\n",
|
||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>92592</td>\n",
|
||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||
" <td>206</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>2163</td>\n",
|
||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||
" <td>27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>131</th>\n",
|
||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1619</td>\n",
|
||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1265</td>\n",
|
||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||
" <td>44</td>\n",
|
||
" <td>687</td>\n",
|
||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||
" <td>166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2681</th>\n",
|
||
" <td>vom Eisenkernvorrichtung (Teil vom Kettenlauf ...</td>\n",
|
||
" <td>136</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[515]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2680</th>\n",
|
||
" <td>Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[311]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2679</th>\n",
|
||
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[415]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2677</th>\n",
|
||
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
|
||
" <td>126</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[323]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2676</th>\n",
|
||
" <td>Docke angefahren!</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[176]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5090 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" entry len num_occur \\\n",
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n",
|
||
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
|
||
"... ... ... ... \n",
|
||
"2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n",
|
||
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
|
||
"2676 Docke angefahren! 17 1 \n",
|
||
"\n",
|
||
" assoc_obj_ids num_assoc_obj_ids \n",
|
||
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
|
||
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n",
|
||
"131 [0, 970, 2134, 2137] 4 \n",
|
||
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
|
||
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
|
||
"... ... ... \n",
|
||
"2681 [515] 1 \n",
|
||
"2680 [311] 1 \n",
|
||
"2679 [415] 1 \n",
|
||
"2677 [323] 1 \n",
|
||
"2676 [176] 1 \n",
|
||
"\n",
|
||
"[5090 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ret"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "9b1df142-f790-41e2-9039-d6c467957fd4",
|
||
"metadata": {},
|
||
"source": [
|
||
"# End Preprocessing"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e5728951-4515-48b1-b9c1-5694dcae2ba4",
|
||
"metadata": {},
|
||
"source": [
|
||
"---"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.8"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|