{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c75d375c-3fe4-4bf4-a987-f2ceb5f98072", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 5, "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from ihm_analyse import CONFIG\n", "from ihm_analyse.lib.preprocess import (\n", " load_raw_data,\n", " remove_duplicates,\n", " remove_NA,\n", " clean_string_slim,\n", " entry_wise_cleansing,\n", " analyse_feature,\n", " build_cosSim_matrix,\n", " filt_thresh_cosSim_matrix,\n", " list_cosSim_dupl_candidates,\n", " merge_similarity_dupl,\n", ")\n", "from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", "from ihm_analyse.lib.helpers import (\n", " save_pickle, \n", " load_pickle, \n", " create_saving_folder,\n", " load_toml_config,\n", ")\n", "\n", "from sentence_transformers import SentenceTransformer\n", "import spacy\n", "from pathlib import Path" ] }, { "cell_type": "markdown", "id": "b162b112-a6f6-42a9-9929-19fc32ba181c", "metadata": {}, "source": [ "# Preprocessing\n", "\n", "## Whole Dataset" ] }, { "cell_type": "code", "execution_count": 13, "id": "c797bcf3-a982-4717-b654-ae8802420a99", "metadata": {}, "outputs": [], "source": [ "# constants and other pre-defined variables\n", "DATA_SET_ID = 'Export4'\n", "FILE_PATH = f'./01_2_Rohdaten_neu/{DATA_SET_ID}.csv'\n", "date_cols = ['VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn', 'ErstellungsDatum']\n", 
"\n", "SAVE_PATH_FOLDER = f'./results/{DATA_SET_ID}/'\n", "create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)\n", "\n", "path_raw_data = Path(FILE_PATH)" ] }, { "cell_type": "code", "execution_count": 14, "id": "994ce600-198c-4e2c-977b-b4af1b77766b", "metadata": {}, "outputs": [], "source": [ "pipe = BasePipeline(name='Preprocess1', working_dir=SAVE_PATH_FOLDER)\n", "pipe.add(load_raw_data, {'date_cols': date_cols})\n", "pipe.add(remove_duplicates)\n", "pipe.add(remove_NA, save_result=True)\n", "#pipe.add(intermediate_save, {'saving_path': SAVE_PATH_FOLDER, 'filename': 'WO-dupl', 'pipeline': pipe})\n", "pipe.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})\n", "pipe.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)\n", "#pipe.add(intermediate_save, {'saving_path': SAVE_PATH_FOLDER, 'filename': 'analyse-feature', 'pipeline': pipe})" ] }, { "cell_type": "code", "execution_count": 15, "id": "e83b6de7-d2a1-4b45-ae16-50347ca677ef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe" ] }, { "cell_type": "code", "execution_count": 16, "id": "45f67843-e44d-4168-843c-762d84f66284", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Starting processing pipeline...\n", "INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n", "INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20\n", "INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84\n", "INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.\n", "INFO:ihm_analyse.preprocess:New Dataset properties: number of 
entries: 128936, number of features 20\n", "INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.\n", "INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl\n", "INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<\n", "INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n" ] } ], "source": [ "ret = pipe.run(starting_values=(path_raw_data,))" ] }, { "cell_type": "code", "execution_count": 17, "id": "733539d8-48e7-40bd-b412-65037514f064", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
entrylennum_occurassoc_obj_idsnum_assoc_obj_ids
162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
33Wöchentliche Sichtkontrolle / Reinigung391654[301, 304, 305, 313, 314, 331, 332, 510, 511, ...18
131Tägliche Überprüfung der Ölabscheider371616[0, 970, 2134, 2137]4
160Wöchentliche Kontrolle der WC-Anlagen371265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
..................
2679Zahnräder der Laufkatze verschlissen Ersatztei...1701[415]1
2678Bitte 8 Scheiben nach Muster anfertigen. Danke.481[140]1
2677Schalter für Bühne Schwenken abgerissen, bitte...1261[323]1
2676Docke angefahren!171[176]1
6799Befestigung Deckel für Batteriefach defekt ...1071[326]1
\n", "

6800 rows × 5 columns

\n", "
" ], "text/plain": [ " entry len num_occur \\\n", "162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n", "33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n", "131 Tägliche Überprüfung der Ölabscheider 37 1616 \n", "160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n", "140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n", "... ... ... ... \n", "2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n", "2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n", "2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n", "2676 Docke angefahren! 17 1 \n", "6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n", "\n", " assoc_obj_ids num_assoc_obj_ids \n", "162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n", "33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n", "131 [0, 970, 2134, 2137] 4 \n", "160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n", "140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n", "... ... ... 
\n", "2679 [415] 1 \n", "2678 [140] 1 \n", "2677 [323] 1 \n", "2676 [176] 1 \n", "6799 [326] 1 \n", "\n", "[6800 rows x 5 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret[0]" ] }, { "cell_type": "code", "execution_count": 171, "id": "a4e2d4c3-8832-45cd-b90b-6065714f6bce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Loaded file successfully.\n", "INFO:ihm_analyse.helpers:Loaded file successfully.\n" ] } ], "source": [ "# intermediate load: loading of intermediate results\n", "ret = pipe.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Preprocess1_Step-3_remove_NA')\n", "pre_1 = pipe.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Preprocess1_Step-5_analyse_feature')\n", "preprocessed_data = pre_1[0]\n", "#ret = intermediate_load(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Preprocess1_Step-3_remove_NA')" ] }, { "cell_type": "code", "execution_count": 172, "id": "23590b29-e31e-4bfd-bc8c-d9c23dd2e363", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
entrylennum_occurassoc_obj_idsnum_assoc_obj_ids
162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
33Wöchentliche Sichtkontrolle / Reinigung391654[301, 304, 305, 313, 314, 331, 332, 510, 511, ...18
131Tägliche Überprüfung der Ölabscheider371616[0, 970, 2134, 2137]4
160Wöchentliche Kontrolle der WC-Anlagen371265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
..................
2679Zahnräder der Laufkatze verschlissen Ersatztei...1701[415]1
2678Bitte 8 Scheiben nach Muster anfertigen. Danke.481[140]1
2677Schalter für Bühne Schwenken abgerissen, bitte...1261[323]1
2676Docke angefahren!171[176]1
6799Befestigung Deckel für Batteriefach defekt ...1071[326]1
\n", "

6800 rows × 5 columns

\n", "
" ], "text/plain": [ " entry len num_occur \\\n", "162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n", "33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n", "131 Tägliche Überprüfung der Ölabscheider 37 1616 \n", "160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n", "140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n", "... ... ... ... \n", "2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n", "2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n", "2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n", "2676 Docke angefahren! 17 1 \n", "6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n", "\n", " assoc_obj_ids num_assoc_obj_ids \n", "162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n", "33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n", "131 [0, 970, 2134, 2137] 4 \n", "160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n", "140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n", "... ... ... 
\n", "2679 [415] 1 \n", "2678 [140] 1 \n", "2677 [323] 1 \n", "2676 [176] 1 \n", "6799 [326] 1 \n", "\n", "[6800 rows x 5 columns]" ] }, "execution_count": 172, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessed_data" ] }, { "cell_type": "markdown", "id": "09fd738a-ee8e-447e-8c3c-7de8566cca32", "metadata": {}, "source": [ "## Embeddings" ] }, { "cell_type": "code", "execution_count": 56, "id": "946e29e0-6253-4c30-9c96-03d3b721abca", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n", "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n" ] } ], "source": [ "# Threshold handed to filt_thresh_cosSim_matrix below; the value comes from the project CONFIG.\n", "SIMILARITY_THRESHOLD = CONFIG['preprocess']['cosine_similarity_threshold']\n", "# File name passed to list_cosSim_dupl_candidates for the exported candidate list.\n", "FILENAME_COSSIM_CANDFILT_WHOLE = 'CosSim-FiltCand'\n", "\n", "pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)\n", "# The spaCy pipeline 'de_dep_news_trf' used to be loaded here but was never used by any cell,\n", "# so the (slow, memory-hungry) load was dropped.\n", "# NOTE(review): all-mpnet-base-v2 is published as an English model while the entries are German -\n", "# confirm the model choice (a multilingual model may separate near-duplicates better).\n", "model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')" ] }, { "cell_type": "code", "execution_count": 57, "id": "c7c3375a-cc8b-4100-84e7-9608074aab2d", "metadata": {}, "outputs": [], "source": [ "pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)\n", "pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': SIMILARITY_THRESHOLD}, save_result=True)\n", "pipe_embds.add(\n", " list_cosSim_dupl_candidates, \n", " {'save_candidates': True, \n", " 'saving_path': SAVE_PATH_FOLDER,\n", " 'filename': FILENAME_COSSIM_CANDFILT_WHOLE,\n", " 'pipeline': pipe_embds}, save_result=True)" ] }, { "cell_type": "code", "execution_count": 58, "id": "ff1ac9cd-66fd-4ec8-bca2-f48b13e5c943", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "162 Tägliche Wartungstätigkeiten nach Vorgabe des ...\n", "33 Wöchentliche Sichtkontrolle / Reinigung\n", "131 Tägliche Überprüfung der Ölabscheider\n", "160 Wöchentliche 
Kontrolle der WC-Anlagen\n", "140 Halbjährliche Kontrolle des Stabbreithalters\n", " ... \n", "2679 Zahnräder der Laufkatze verschlissen Ersatztei...\n", "2678 Bitte 8 Scheiben nach Muster anfertigen. Danke.\n", "2677 Schalter für Bühne Schwenken abgerissen, bitte...\n", "2676 Docke angefahren!\n", "6799 Befestigung Deckel für Batteriefach defekt ...\n", "Name: entry, Length: 6787, dtype: object" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only descriptions with at least MIN_ENTRY_LEN characters ('len' column);\n", "# shorter entries are excluded from the embedding pipeline run below.\n", "MIN_ENTRY_LEN = 6\n", "subset_data = preprocessed_data.loc[preprocessed_data['len'] >= MIN_ENTRY_LEN, 'entry'].copy()\n", "\n", "# For a quick smoke test, use the 20-entry sample on the next line instead of the full copy.\n", "#start_val = subset_data.iloc[:20].copy()\n", "start_val = subset_data.copy()\n", "start_val" ] }, { "cell_type": "code", "execution_count": 59, "id": "bd94dbbe-cfd3-45f2-8b8e-127103fce2f1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Starting processing pipeline...\n", "INFO:ihm_analyse.preprocess:Start building embedding map...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████| 6787/6787 [06:08<00:00, 18.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.preprocess:Embedding map built successfully.\n", "INFO:ihm_analyse.preprocess:Start calculation of similarity scores...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "100%|███████████████████████████████████████████████████████████████████| 23028291/23028291 [18:00<00:00, 21305.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.preprocess:Similarity scores calculated successfully.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Saved file successfully under 
./results/Export4/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl\n", "INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl\n", "INFO:ihm_analyse.preprocess:Start gathering of similarity candidates...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████████| 9331/9331 [00:03<00:00, 2737.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.preprocess:Similarity candidates gathered successfully.\n", "INFO:ihm_analyse.preprocess:Saving similarity candidates...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.preprocess:Similarity candidates saved successfully to >>./results/Export4/PipeStep_3_CosSim-FiltCand.xlsx<<.\n", "INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n" ] } ], "source": [ "dupl_idx_pairs, embds = pipe_embds.run(starting_values=(start_val,))" ] }, { "cell_type": "code", "execution_count": null, "id": "dbf4b2be-2486-4859-a6e8-541fadfe6e6f", "metadata": {}, "outputs": [], "source": [ "# Optional: uncomment one line to reload a saved intermediate result of the embedding pipeline\n", "# instead of re-running it.\n", "#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-1_build_cosSim_matrix')\n", "#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix')" ] }, { "cell_type": "code", "execution_count": 175, "id": "91edbe35-1694-4f22-a777-1a1b6577a807", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9331" ] }, "execution_count": 175, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dupl_idx_pairs)" ] }, { "cell_type": "code", "execution_count": 166, "id": 
"47cee17a-9beb-49c0-97d1-d6380942cf16", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'./results/Export4/dupl_idx_pairs.pkl'" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path_to_idx_pairs = SAVE_PATH_FOLDER + 'dupl_idx_pairs.pkl'\n", "path_to_idx_pairs" ] }, { "cell_type": "code", "execution_count": 174, "id": "284bab2e-e704-4f75-9599-0d80e7f98894", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Loaded file successfully.\n" ] } ], "source": [ "# Write the candidate index pairs to disk if no pickle exists yet, then (as before) always\n", "# continue with the version loaded from disk. Previously the save call was commented out,\n", "# so this cell only worked when the file was left over from an earlier session.\n", "# Delete the pickle to force a refresh after re-running the embedding pipeline.\n", "if not Path(path_to_idx_pairs).exists():\n", "    save_pickle(obj=dupl_idx_pairs, path=path_to_idx_pairs)\n", "dupl_idx_pairs = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='dupl_idx_pairs')" ] }, { "cell_type": "markdown", "id": "88de908f-9da7-4baf-9f34-61b0cd435c6b", "metadata": {}, "source": [ "## Merge Duplicate Candidates" ] }, { "cell_type": "code", "execution_count": 176, "id": "1e7a759b-7899-44e1-9673-7aed87db57fd", "metadata": {}, "outputs": [], "source": [ "prep_data = preprocessed_data.copy()\n", "\n", "pipe_3 = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)" ] }, { "cell_type": "code", "execution_count": 177, "id": "2cde0c91-9847-4230-a0f3-89e292810e07", "metadata": {}, "outputs": [], "source": [ "pipe_3.add(merge_similarity_dupl, save_result=True)" ] }, { "cell_type": "code", "execution_count": 178, "id": "cba685ba-8e8c-4890-8df6-652753346e5d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Starting processing pipeline...\n", "INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████| 9331/9331 [00:00<00:00, 10511.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n", 
"INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n" ] } ], "source": [ "# merge duplicate candidates\n", "# info needed: preprocessed data + idx pairs of duplicate candidates\n", "ret = pipe_3.run(starting_values=(prep_data, dupl_idx_pairs))" ] }, { "cell_type": "code", "execution_count": 75, "id": "9f8b13ca-eeaa-46a0-b791-10d227d25db8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
entrylennum_occurassoc_obj_idsnum_assoc_obj_ids
162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
33Wöchentliche Sichtkontrolle / Reinigung392163[301, 304, 305, 313, 314, 323, 329, 331, 332, ...27
131Tägliche Überprüfung der Ölabscheider371619[0, 970, 2134, 2137]4
160Wöchentliche Kontrolle der WC-Anlagen371265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
..................
2681vom Eisenkernvorrichtung (Teil vom Kettenlauf ...1361[515]1
2680Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...2601[311]1
2679Zahnräder der Laufkatze verschlissen Ersatztei...1701[415]1
2677Schalter für Bühne Schwenken abgerissen, bitte...1261[323]1
2676Docke angefahren!171[176]1
\n", "

5090 rows × 5 columns

\n", "
" ], "text/plain": [ " entry len num_occur \\\n", "162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n", "33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n", "131 Tägliche Überprüfung der Ölabscheider 37 1619 \n", "160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n", "140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n", "... ... ... ... \n", "2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n", "2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n", "2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n", "2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n", "2676 Docke angefahren! 17 1 \n", "\n", " assoc_obj_ids num_assoc_obj_ids \n", "162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n", "33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n", "131 [0, 970, 2134, 2137] 4 \n", "160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n", "140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n", "... ... ... \n", "2681 [515] 1 \n", "2680 [311] 1 \n", "2679 [415] 1 \n", "2677 [323] 1 \n", "2676 [176] 1 \n", "\n", "[5090 rows x 5 columns]" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret" ] }, { "cell_type": "markdown", "id": "9b1df142-f790-41e2-9039-d6c467957fd4", "metadata": {}, "source": [ "# End Preprocessing" ] }, { "cell_type": "markdown", "id": "e5728951-4515-48b1-b9c1-5694dcae2ba4", "metadata": {}, "source": [ "---" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 5 }