lang-main/notebooks/Preprocess_Pipeline.ipynb
2024-08-07 20:06:06 +02:00

1086 lines
38 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c75d375c-3fe4-4bf4-a987-f2ceb5f98072",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
"metadata": {},
"outputs": [],
"source": [
"from lang_main import CONFIG\n",
"from lang_main.lib.preprocess import (\n",
" load_raw_data,\n",
" remove_duplicates,\n",
" remove_NA,\n",
" clean_string_slim,\n",
" entry_wise_cleansing,\n",
" analyse_feature,\n",
" build_cosSim_matrix,\n",
" filt_thresh_cosSim_matrix,\n",
" list_cosSim_dupl_candidates,\n",
" merge_similarity_dupl,\n",
")\n",
"from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
"from lang_main.lib.helpers import (\n",
" save_pickle, \n",
" load_pickle, \n",
" create_saving_folder,\n",
" load_toml_config,\n",
")\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"import spacy\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"id": "b162b112-a6f6-42a9-9929-19fc32ba181c",
"metadata": {},
"source": [
"# Preprocessing\n",
"\n",
"## Whole Dataset"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c797bcf3-a982-4717-b654-ae8802420a99",
"metadata": {},
"outputs": [],
"source": [
"# --- run configuration ---\n",
"# Identifier of the raw export; it selects both the input CSV and the results sub-folder.\n",
"DATA_SET_ID = 'Export4'\n",
"\n",
"# Input: raw CSV file plus the columns handed to `load_raw_data` as `date_cols`.\n",
"FILE_PATH = f'./01_2_Rohdaten_neu/{DATA_SET_ID}.csv'\n",
"path_raw_data = Path(FILE_PATH)\n",
"date_cols = [\n",
"    'VorgangsDatum',\n",
"    'ErledigungsDatum',\n",
"    'Arbeitsbeginn',\n",
"    'ErstellungsDatum',\n",
"]\n",
"\n",
"# Output: results folder; every pipeline in this notebook uses it as `working_dir`.\n",
"SAVE_PATH_FOLDER = f'./results/{DATA_SET_ID}/'\n",
"create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "994ce600-198c-4e2c-977b-b4af1b77766b",
"metadata": {},
"outputs": [],
"source": [
"# Feature (column) holding the free-text descriptions that are cleansed and analysed.\n",
"TARGET_FEATURE = 'VorgangsBeschreibung'\n",
"\n",
"# Assemble the preprocessing steps in execution order; they are executed by `pipe.run(...)`.\n",
"pipe = BasePipeline(name='Preprocess1', working_dir=SAVE_PATH_FOLDER)\n",
"pipe.add(load_raw_data, {'date_cols': date_cols})\n",
"pipe.add(remove_duplicates)\n",
"# save_result=True: the step result is written to the working dir\n",
"# (logged as 'Pipe-Preprocess1_Step-3_remove_NA.pkl').\n",
"pipe.add(remove_NA, save_result=True)\n",
"pipe.add(\n",
"    entry_wise_cleansing,\n",
"    {'target_feature': TARGET_FEATURE, 'cleansing_func': clean_string_slim},\n",
")\n",
"pipe.add(analyse_feature, {'target_feature': TARGET_FEATURE}, save_result=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e83b6de7-d2a1-4b45-ae16-50347ca677ef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "45f67843-e44d-4168-843c-762d84f66284",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n",
"INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20\n",
"INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84\n",
"INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.\n",
"INFO:ihm_analyse.preprocess:New Dataset properties: number of entries: 128936, number of features 20\n",
"INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl\n",
"INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<\n",
"INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"ret = pipe.run(starting_values=(path_raw_data,))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "733539d8-48e7-40bd-b412-65037514f064",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>1654</td>\n",
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1616</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
" <td>48</td>\n",
" <td>1</td>\n",
" <td>[140]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6799</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>107</td>\n",
" <td>1</td>\n",
" <td>[326]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6800 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": 171,
"id": "a4e2d4c3-8832-45cd-b90b-6065714f6bce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# Reload the results persisted by the 'Preprocess1' run, so the pipeline does not have to be\n",
"# executed again in a fresh session.\n",
"# The step-3 result gets its own name instead of overwriting `ret` (the pipeline return value).\n",
"data_without_na = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER, filename='Pipe-Preprocess1_Step-3_remove_NA'\n",
")\n",
"pre_1 = pipe.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER, filename='Pipe-Preprocess1_Step-5_analyse_feature'\n",
")\n",
"# Element 0 of the loaded step result is the feature-analysis table used by all later sections.\n",
"preprocessed_data = pre_1[0]"
]
},
{
"cell_type": "code",
"execution_count": 172,
"id": "23590b29-e31e-4bfd-bc8c-d9c23dd2e363",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>1654</td>\n",
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1616</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
" <td>48</td>\n",
" <td>1</td>\n",
" <td>[140]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6799</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>107</td>\n",
" <td>1</td>\n",
" <td>[326]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6800 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessed_data"
]
},
{
"cell_type": "markdown",
"id": "09fd738a-ee8e-447e-8c3c-7de8566cca32",
"metadata": {},
"source": [
"## Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "946e29e0-6253-4c30-9c96-03d3b721abca",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n",
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
]
}
],
"source": [
"# The similarity threshold is read from the project configuration, not set in this notebook.\n",
"preprocess_cfg = CONFIG['preprocess']\n",
"SIMILARITY_THRESHOLD = preprocess_cfg['cosine_similarity_threshold']\n",
"# File name passed to `list_cosSim_dupl_candidates` for the saved candidate list.\n",
"FILENAME_COSSIM_CANDFILT_WHOLE = 'CosSim-FiltCand'\n",
"\n",
"pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)\n",
"\n",
"# NOTE(review): `nlp` is not referenced by any other cell of this notebook - confirm that\n",
"# loading the spaCy model is still needed here.\n",
"nlp = spacy.load('de_dep_news_trf')\n",
"# Sentence-embedding model handed to `build_cosSim_matrix`.\n",
"# NOTE(review): the processed descriptions are German - confirm this checkpoint is the intended one.\n",
"model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "c7c3375a-cc8b-4100-84e7-9608074aab2d",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: build the cosine-similarity matrix with the sentence-embedding model.\n",
"pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)\n",
"\n",
"# Step 2: filter the matrix with the configured similarity threshold.\n",
"pipe_embds.add(\n",
"    filt_thresh_cosSim_matrix,\n",
"    {'threshold': SIMILARITY_THRESHOLD},\n",
"    save_result=True,\n",
")\n",
"\n",
"# Step 3: list the duplicate candidates; the options below also request saving them to a file.\n",
"candidate_export_kwargs = {\n",
"    'save_candidates': True,\n",
"    'saving_path': SAVE_PATH_FOLDER,\n",
"    'filename': FILENAME_COSSIM_CANDFILT_WHOLE,\n",
"    'pipeline': pipe_embds,\n",
"}\n",
"pipe_embds.add(list_cosSim_dupl_candidates, candidate_export_kwargs, save_result=True)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "ff1ac9cd-66fd-4ec8-bca2-f48b13e5c943",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ...\n",
"33 Wöchentliche Sichtkontrolle / Reinigung\n",
"131 Tägliche Überprüfung der Ölabscheider\n",
"160 Wöchentliche Kontrolle der WC-Anlagen\n",
"140 Halbjährliche Kontrolle des Stabbreithalters\n",
" ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei...\n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke.\n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte...\n",
"2676 Docke angefahren!\n",
"6799 Befestigung Deckel für Batteriefach defekt ...\n",
"Name: entry, Length: 6787, dtype: object"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Minimum number of characters an entry must have to take part in the similarity analysis.\n",
"MIN_ENTRY_LEN = 6\n",
"\n",
"# Keep only the text column of entries with at least MIN_ENTRY_LEN characters;\n",
"# `.copy()` detaches the selection from `preprocessed_data`.\n",
"subset_data = preprocessed_data.loc[preprocessed_data['len'] >= MIN_ENTRY_LEN, 'entry'].copy()\n",
"\n",
"# Starting value for the embedding pipeline. For a quick trial run, a small slice such as\n",
"# subset_data.iloc[:20] can be used instead of the full subset.\n",
"start_val = subset_data.copy()\n",
"start_val"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "bd94dbbe-cfd3-45f2-8b8e-127103fce2f1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Start building embedding map...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████| 6787/6787 [06:08<00:00, 18.43it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Embedding map built successfully.\n",
"INFO:ihm_analyse.preprocess:Start calculation of similarity scores...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|███████████████████████████████████████████████████████████████████| 23028291/23028291 [18:00<00:00, 21305.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity scores calculated successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl\n",
"INFO:ihm_analyse.preprocess:Start gathering of similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████| 9331/9331 [00:03<00:00, 2737.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates gathered successfully.\n",
"INFO:ihm_analyse.preprocess:Saving similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates saved successfully to >>./results/Export4/PipeStep_3_CosSim-FiltCand.xlsx<<.\n",
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"dupl_idx_pairs, embds = pipe_embds.run(starting_values=(start_val,))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "dbf4b2be-2486-4859-a6e8-541fadfe6e6f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-1_build_cosSim_matrix')\n",
"#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix')"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "91edbe35-1694-4f22-a777-1a1b6577a807",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9331"
]
},
"execution_count": 175,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dupl_idx_pairs)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "47cee17a-9beb-49c0-97d1-d6380942cf16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'./results/Export4/dupl_idx_pairs.pkl'"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pickle file for the duplicate-candidate index pairs, located in the results folder.\n",
"# The folder string is used as a plain prefix, so it has to end with a path separator.\n",
"path_to_idx_pairs = f'{SAVE_PATH_FOLDER}dupl_idx_pairs.pkl'\n",
"path_to_idx_pairs"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "284bab2e-e704-4f75-9599-0d80e7f98894",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# Persist the candidate pairs once, so that a run from a fresh kernel does not fail on a missing\n",
"# file and later sessions can skip the embedding pipeline. An existing file is not overwritten.\n",
"if not Path(path_to_idx_pairs).exists():\n",
"    save_pickle(obj=dupl_idx_pairs, path=path_to_idx_pairs)\n",
"\n",
"# NOTE(review): assumes `load_intermediate_result` resolves saving_path + filename to the same\n",
"# '.pkl' file as `path_to_idx_pairs` - confirm against the lang_main implementation.\n",
"dupl_idx_pairs = pipe_embds.load_intermediate_result(\n",
"    saving_path=SAVE_PATH_FOLDER, filename='dupl_idx_pairs'\n",
")"
]
},
{
"cell_type": "markdown",
"id": "88de908f-9da7-4baf-9f34-61b0cd435c6b",
"metadata": {},
"source": [
"## Merge Duplicate Candidates"
]
},
{
"cell_type": "code",
"execution_count": 176,
"id": "1e7a759b-7899-44e1-9673-7aed87db57fd",
"metadata": {},
"outputs": [],
"source": [
"prep_data = preprocessed_data.copy()\n",
"\n",
"pipe_3 = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "2cde0c91-9847-4230-a0f3-89e292810e07",
"metadata": {},
"outputs": [],
"source": [
"pipe_3.add(merge_similarity_dupl, save_result=True)"
]
},
{
"cell_type": "code",
"execution_count": 178,
"id": "cba685ba-8e8c-4890-8df6-652753346e5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline...\n",
"INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████| 9331/9331 [00:00<00:00, 10511.31it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n"
]
}
],
"source": [
"# merge duplicate candidates\n",
"# info needed: preprocessed data + idx pairs of duplicate candidates\n",
"ret = pipe_3.run(starting_values=(prep_data, dupl_idx_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "9f8b13ca-eeaa-46a0-b791-10d227d25db8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>2163</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2681</th>\n",
" <td>vom Eisenkernvorrichtung (Teil vom Kettenlauf ...</td>\n",
" <td>136</td>\n",
" <td>1</td>\n",
" <td>[515]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2680</th>\n",
" <td>Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...</td>\n",
" <td>260</td>\n",
" <td>1</td>\n",
" <td>[311]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5090 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n",
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2681 [515] 1 \n",
"2680 [311] 1 \n",
"2679 [415] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"\n",
"[5090 rows x 5 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret"
]
},
{
"cell_type": "markdown",
"id": "9b1df142-f790-41e2-9039-d6c467957fd4",
"metadata": {},
"source": [
"# End Preprocessing"
]
},
{
"cell_type": "markdown",
"id": "e5728951-4515-48b1-b9c1-5694dcae2ba4",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}