lang-main/test-notebooks/Preprocess_Pipeline.ipynb
Florian Förster 9edcd5be4e initial commit
2024-05-08 14:46:43 +02:00

1090 lines
37 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c75d375c-3fe4-4bf4-a987-f2ceb5f98072",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from ihm_analyse import CONFIG\n",
"from ihm_analyse.lib.preprocess import (\n",
" load_raw_data,\n",
" remove_duplicates,\n",
" remove_NA,\n",
" clean_string_slim,\n",
" entry_wise_cleansing,\n",
" analyse_feature,\n",
" build_cosSim_matrix,\n",
" filt_thresh_cosSim_matrix,\n",
" list_cosSim_dupl_candidates,\n",
" merge_similarity_dupl,\n",
")\n",
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
"from ihm_analyse.lib.helpers import (\n",
" save_pickle, \n",
" load_pickle, \n",
" create_saving_folder,\n",
" load_toml_config,\n",
")\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"import spacy\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"id": "b162b112-a6f6-42a9-9929-19fc32ba181c",
"metadata": {},
"source": [
"# Preprocessing\n",
"\n",
"## Whole Dataset"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c797bcf3-a982-4717-b654-ae8802420a99",
"metadata": {},
"outputs": [],
"source": [
"# dataset selection; input and output locations are derived from the dataset id\n",
"DATA_SET_ID = 'Export4'\n",
"FILE_PATH = f'./01_2_Rohdaten_neu/{DATA_SET_ID}.csv'\n",
"path_raw_data = Path(FILE_PATH)\n",
"SAVE_PATH_FOLDER = f'./results/{DATA_SET_ID}/'\n",
"\n",
"# columns handed to load_raw_data through its 'date_cols' argument\n",
"date_cols = [\n",
"    'VorgangsDatum',\n",
"    'ErledigungsDatum',\n",
"    'Arbeitsbeginn',\n",
"    'ErstellungsDatum',\n",
"]\n",
"\n",
"# set up the results folder before any pipeline writes into it\n",
"create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "994ce600-198c-4e2c-977b-b4af1b77766b",
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing pipeline for the whole dataset. It is created and filled in the\n",
"# same cell, so re-running the cell always starts from a fresh pipeline object.\n",
"pipe = BasePipeline(name='Preprocess1', working_dir=SAVE_PATH_FOLDER)\n",
"pipe.add(load_raw_data, {'date_cols': date_cols})\n",
"pipe.add(remove_duplicates)\n",
"# save_result=True stores the step output in the working dir; these\n",
"# 'Pipe-Preprocess1_Step-...' files are loaded again further below\n",
"pipe.add(remove_NA, save_result=True)\n",
"pipe.add(\n",
"    entry_wise_cleansing,\n",
"    {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim},\n",
")\n",
"pipe.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e83b6de7-d2a1-4b45-ae16-50347ca677ef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "45f67843-e44d-4168-843c-762d84f66284",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n",
"INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20\n",
"INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84\n",
"INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.\n",
"INFO:ihm_analyse.preprocess:New Dataset properties: number of entries: 128936, number of features 20\n",
"INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl\n",
"INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<\n",
"INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"ret = pipe.run(starting_values=(path_raw_data,))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "733539d8-48e7-40bd-b412-65037514f064",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>1654</td>\n",
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1616</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
" <td>48</td>\n",
" <td>1</td>\n",
" <td>[140]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6799</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>107</td>\n",
" <td>1</td>\n",
" <td>[326]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6800 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": 171,
"id": "a4e2d4c3-8832-45cd-b90b-6065714f6bce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# Reload the results that the 'Preprocess1' pipeline saved (steps added with\n",
"# save_result=True), so later sections can start from disk instead of re-running it.\n",
"ret = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER,\n",
"    filename='Pipe-Preprocess1_Step-3_remove_NA',\n",
")\n",
"pre_1 = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER,\n",
"    filename='Pipe-Preprocess1_Step-5_analyse_feature',\n",
")\n",
"# first element of the loaded result is the per-entry analysis table displayed below\n",
"preprocessed_data = pre_1[0]"
]
},
{
"cell_type": "code",
"execution_count": 172,
"id": "23590b29-e31e-4bfd-bc8c-d9c23dd2e363",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>1654</td>\n",
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1616</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
" <td>48</td>\n",
" <td>1</td>\n",
" <td>[140]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6799</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>107</td>\n",
" <td>1</td>\n",
" <td>[326]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6800 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessed_data"
]
},
{
"cell_type": "markdown",
"id": "09fd738a-ee8e-447e-8c3c-7de8566cca32",
"metadata": {},
"source": [
"## Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "946e29e0-6253-4c30-9c96-03d3b721abca",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n",
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
]
}
],
"source": [
"# settings for the similarity-based duplicate search\n",
"SIMILARITY_THRESHOLD = CONFIG['preprocess']['cosine_similarity_threshold']\n",
"FILENAME_COSSIM_CANDFILT_WHOLE = 'CosSim-FiltCand'\n",
"\n",
"# NOTE(review): `nlp` is not referenced by any other cell of this notebook - confirm it is still needed\n",
"nlp = spacy.load('de_dep_news_trf')\n",
"# NOTE(review): all-mpnet-base-v2 appears to be documented as an English model while the\n",
"# descriptions processed here are German - confirm the model choice\n",
"model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "c7c3375a-cc8b-4100-84e7-9608074aab2d",
"metadata": {},
"outputs": [],
"source": [
"# Create and fill the pipeline in one cell (as done for 'Preprocess1'): re-running this\n",
"# cell then starts from a fresh pipeline instead of adding the same steps once more\n",
"# (assumes add() appends to the existing step list - TODO confirm).\n",
"pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)\n",
"pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)\n",
"pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': SIMILARITY_THRESHOLD}, save_result=True)\n",
"pipe_embds.add(\n",
"    list_cosSim_dupl_candidates,\n",
"    {\n",
"        'save_candidates': True,\n",
"        'saving_path': SAVE_PATH_FOLDER,\n",
"        'filename': FILENAME_COSSIM_CANDFILT_WHOLE,\n",
"        'pipeline': pipe_embds,\n",
"    },\n",
"    save_result=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "ff1ac9cd-66fd-4ec8-bca2-f48b13e5c943",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ...\n",
"33 Wöchentliche Sichtkontrolle / Reinigung\n",
"131 Tägliche Überprüfung der Ölabscheider\n",
"160 Wöchentliche Kontrolle der WC-Anlagen\n",
"140 Halbjährliche Kontrolle des Stabbreithalters\n",
" ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei...\n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke.\n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte...\n",
"2676 Docke angefahren!\n",
"6799 Befestigung Deckel für Batteriefach defekt ...\n",
"Name: entry, Length: 6787, dtype: object"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop very short descriptions: keep entries whose 'len' value is at least MIN_ENTRY_LEN\n",
"MIN_ENTRY_LEN = 6\n",
"subset_data = preprocessed_data.loc[preprocessed_data['len'] >= MIN_ENTRY_LEN, 'entry'].copy()\n",
"\n",
"# separate copy as pipeline input; subset_data itself is not passed on\n",
"start_val = subset_data.copy()\n",
"start_val"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "bd94dbbe-cfd3-45f2-8b8e-127103fce2f1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Start building embedding map...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████| 6787/6787 [06:08<00:00, 18.43it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Embedding map built successfully.\n",
"INFO:ihm_analyse.preprocess:Start calculation of similarity scores...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|███████████████████████████████████████████████████████████████████| 23028291/23028291 [18:00<00:00, 21305.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity scores calculated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl\n",
"INFO:ihm_analyse.preprocess:Start gathering of similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████| 9331/9331 [00:03<00:00, 2737.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates gathered successfully.\n",
"INFO:ihm_analyse.preprocess:Saving similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates saved successfully to >>./results/Export4/PipeStep_3_CosSim-FiltCand.xlsx<<.\n",
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"dupl_idx_pairs, embds = pipe_embds.run(starting_values=(start_val,))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "dbf4b2be-2486-4859-a6e8-541fadfe6e6f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-1_build_cosSim_matrix')\n",
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix')"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "91edbe35-1694-4f22-a777-1a1b6577a807",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9331"
]
},
"execution_count": 175,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dupl_idx_pairs)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "47cee17a-9beb-49c0-97d1-d6380942cf16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'./results/Export4/dupl_idx_pairs.pkl'"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# SAVE_PATH_FOLDER ends with '/', so the file name can be appended directly\n",
"path_to_idx_pairs = f'{SAVE_PATH_FOLDER}dupl_idx_pairs.pkl'\n",
"path_to_idx_pairs"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "284bab2e-e704-4f75-9599-0d80e7f98894",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# Persist freshly computed duplicate candidates; fall back to the saved file only when the\n",
"# long-running embedding pipeline was not executed in this kernel session. Loading\n",
"# unconditionally would fail on a first run (nothing saved yet) and would replace\n",
"# fresh results with an older file.\n",
"if 'dupl_idx_pairs' in globals():\n",
"    save_pickle(obj=dupl_idx_pairs, path=path_to_idx_pairs)\n",
"else:\n",
"    dupl_idx_pairs = pipe_embds.load_intermediate_result(\n",
"        saving_path=SAVE_PATH_FOLDER,\n",
"        filename='dupl_idx_pairs',\n",
"    )"
]
},
{
"cell_type": "markdown",
"id": "88de908f-9da7-4baf-9f34-61b0cd435c6b",
"metadata": {},
"source": [
"## Merge Duplicate Candidates"
]
},
{
"cell_type": "code",
"execution_count": 176,
"id": "1e7a759b-7899-44e1-9673-7aed87db57fd",
"metadata": {},
"outputs": [],
"source": [
"# hand a copy to the merge step; preprocessed_data itself is left as loaded\n",
"prep_data = preprocessed_data.copy()"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "2cde0c91-9847-4230-a0f3-89e292810e07",
"metadata": {},
"outputs": [],
"source": [
"# Create and fill the pipeline in one cell (as done for 'Preprocess1'): re-running this\n",
"# cell then starts from a fresh pipeline instead of adding the merge step once more\n",
"# (assumes add() appends to the existing step list - TODO confirm).\n",
"pipe_3 = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)\n",
"pipe_3.add(merge_similarity_dupl, save_result=True)"
]
},
{
"cell_type": "code",
"execution_count": 178,
"id": "cba685ba-8e8c-4890-8df6-652753346e5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████| 9331/9331 [00:00<00:00, 10511.31it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"# merge duplicate candidates\n",
"# info needed: preprocessed data + idx pairs of duplicate candidates\n",
"ret = pipe_3.run(starting_values=(prep_data, dupl_idx_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "9f8b13ca-eeaa-46a0-b791-10d227d25db8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>2163</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2681</th>\n",
" <td>vom Eisenkernvorrichtung (Teil vom Kettenlauf ...</td>\n",
" <td>136</td>\n",
" <td>1</td>\n",
" <td>[515]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2680</th>\n",
" <td>Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...</td>\n",
" <td>260</td>\n",
" <td>1</td>\n",
" <td>[311]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5090 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n",
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2681 [515] 1 \n",
"2680 [311] 1 \n",
"2679 [415] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"\n",
"[5090 rows x 5 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret"
]
},
{
"cell_type": "markdown",
"id": "9b1df142-f790-41e2-9039-d6c467957fd4",
"metadata": {},
"source": [
"# End Preprocessing"
]
},
{
"cell_type": "markdown",
"id": "e5728951-4515-48b1-b9c1-5694dcae2ba4",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}