{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c75d375c-3fe4-4bf4-a987-f2ceb5f98072", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 5, "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from ihm_analyse import CONFIG\n", "from ihm_analyse.lib.preprocess import (\n", " load_raw_data,\n", " remove_duplicates,\n", " remove_NA,\n", " clean_string_slim,\n", " entry_wise_cleansing,\n", " analyse_feature,\n", " build_cosSim_matrix,\n", " filt_thresh_cosSim_matrix,\n", " list_cosSim_dupl_candidates,\n", " merge_similarity_dupl,\n", ")\n", "from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", "from ihm_analyse.lib.helpers import (\n", " save_pickle,\n", " load_pickle,\n", " create_saving_folder,\n", " load_toml_config,\n", ")\n", "\n", "from sentence_transformers import SentenceTransformer\n", "import spacy\n", "from pathlib import Path" ] }, { "cell_type": "markdown", "id": "b162b112-a6f6-42a9-9929-19fc32ba181c", "metadata": {}, "source": [ "# Preprocessing\n", "\n", "## Whole Dataset" ] }, { "cell_type": "code", "execution_count": 13, "id": "c797bcf3-a982-4717-b654-ae8802420a99", "metadata": {}, "outputs": [], "source": [ "# constants and other pre-defined variables\n", "DATA_SET_ID = 'Export4'\n", "FILE_PATH = f'./01_2_Rohdaten_neu/{DATA_SET_ID}.csv'\n", "date_cols = ['VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn', 'ErstellungsDatum']\n", 
"\n", "SAVE_PATH_FOLDER = f'./results/{DATA_SET_ID}/'\n", "create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)\n", "\n", "path_raw_data = Path(FILE_PATH)" ] }, { "cell_type": "code", "execution_count": 14, "id": "994ce600-198c-4e2c-977b-b4af1b77766b", "metadata": {}, "outputs": [], "source": [ "# assemble the preprocessing pipeline; steps added with save_result=True are pickled into the working dir\n", "pipe = BasePipeline(name='Preprocess1', working_dir=SAVE_PATH_FOLDER)\n", "pipe.add(load_raw_data, {'date_cols': date_cols})\n", "pipe.add(remove_duplicates)\n", "pipe.add(remove_NA, save_result=True)\n", "pipe.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})\n", "pipe.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)" ] }, { "cell_type": "code", "execution_count": 15, "id": "e83b6de7-d2a1-4b45-ae16-50347ca677ef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe" ] }, { "cell_type": "code", "execution_count": 16, "id": "45f67843-e44d-4168-843c-762d84f66284", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Starting processing pipeline...\n", "INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n", "INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20\n", "INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84\n", "INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.\n", "INFO:ihm_analyse.preprocess:New Dataset properties: number of 
entries: 128936, number of features 20\n", "INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.\n", "INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl\n", "INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<\n", "INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.\n" ] } ], "source": [ "# run every registered step in order, starting from the raw-data path\n", "ret = pipe.run(starting_values=(path_raw_data,))" ] }, { "cell_type": "code", "execution_count": 17, "id": "733539d8-48e7-40bd-b412-65037514f064", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | entry | \n", "len | \n", "num_occur | \n", "assoc_obj_ids | \n", "num_assoc_obj_ids | \n", "
|---|---|---|---|---|---|
| 162 | \n", "Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n", "66 | \n", "92592 | \n", "[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n", "206 | \n", "
| 33 | \n", "Wöchentliche Sichtkontrolle / Reinigung | \n", "39 | \n", "1654 | \n", "[301, 304, 305, 313, 314, 331, 332, 510, 511, ... | \n", "18 | \n", "
| 131 | \n", "Tägliche Überprüfung der Ölabscheider | \n", "37 | \n", "1616 | \n", "[0, 970, 2134, 2137] | \n", "4 | \n", "
| 160 | \n", "Wöchentliche Kontrolle der WC-Anlagen | \n", "37 | \n", "1265 | \n", "[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n", "11 | \n", "
| 140 | \n", "Halbjährliche Kontrolle des Stabbreithalters | \n", "44 | \n", "687 | \n", "[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n", "166 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2679 | \n", "Zahnräder der Laufkatze verschlissen Ersatztei... | \n", "170 | \n", "1 | \n", "[415] | \n", "1 | \n", "
| 2678 | \n", "Bitte 8 Scheiben nach Muster anfertigen. Danke. | \n", "48 | \n", "1 | \n", "[140] | \n", "1 | \n", "
| 2677 | \n", "Schalter für Bühne Schwenken abgerissen, bitte... | \n", "126 | \n", "1 | \n", "[323] | \n", "1 | \n", "
| 2676 | \n", "Docke angefahren! | \n", "17 | \n", "1 | \n", "[176] | \n", "1 | \n", "
| 6799 | \n", "Befestigung Deckel für Batteriefach defekt ... | \n", "107 | \n", "1 | \n", "[326] | \n", "1 | \n", "
6800 rows × 5 columns
\n", "| \n", " | entry | \n", "len | \n", "num_occur | \n", "assoc_obj_ids | \n", "num_assoc_obj_ids | \n", "
|---|---|---|---|---|---|
| 162 | \n", "Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n", "66 | \n", "92592 | \n", "[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n", "206 | \n", "
| 33 | \n", "Wöchentliche Sichtkontrolle / Reinigung | \n", "39 | \n", "1654 | \n", "[301, 304, 305, 313, 314, 331, 332, 510, 511, ... | \n", "18 | \n", "
| 131 | \n", "Tägliche Überprüfung der Ölabscheider | \n", "37 | \n", "1616 | \n", "[0, 970, 2134, 2137] | \n", "4 | \n", "
| 160 | \n", "Wöchentliche Kontrolle der WC-Anlagen | \n", "37 | \n", "1265 | \n", "[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n", "11 | \n", "
| 140 | \n", "Halbjährliche Kontrolle des Stabbreithalters | \n", "44 | \n", "687 | \n", "[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n", "166 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2679 | \n", "Zahnräder der Laufkatze verschlissen Ersatztei... | \n", "170 | \n", "1 | \n", "[415] | \n", "1 | \n", "
| 2678 | \n", "Bitte 8 Scheiben nach Muster anfertigen. Danke. | \n", "48 | \n", "1 | \n", "[140] | \n", "1 | \n", "
| 2677 | \n", "Schalter für Bühne Schwenken abgerissen, bitte... | \n", "126 | \n", "1 | \n", "[323] | \n", "1 | \n", "
| 2676 | \n", "Docke angefahren! | \n", "17 | \n", "1 | \n", "[176] | \n", "1 | \n", "
| 6799 | \n", "Befestigung Deckel für Batteriefach defekt ... | \n", "107 | \n", "1 | \n", "[326] | \n", "1 | \n", "
6800 rows × 5 columns
\n", "| \n", " | entry | \n", "len | \n", "num_occur | \n", "assoc_obj_ids | \n", "num_assoc_obj_ids | \n", "
|---|---|---|---|---|---|
| 162 | \n", "Tägliche Wartungstätigkeiten nach Vorgabe des ... | \n", "66 | \n", "92592 | \n", "[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... | \n", "206 | \n", "
| 33 | \n", "Wöchentliche Sichtkontrolle / Reinigung | \n", "39 | \n", "2163 | \n", "[301, 304, 305, 313, 314, 323, 329, 331, 332, ... | \n", "27 | \n", "
| 131 | \n", "Tägliche Überprüfung der Ölabscheider | \n", "37 | \n", "1619 | \n", "[0, 970, 2134, 2137] | \n", "4 | \n", "
| 160 | \n", "Wöchentliche Kontrolle der WC-Anlagen | \n", "37 | \n", "1265 | \n", "[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... | \n", "11 | \n", "
| 140 | \n", "Halbjährliche Kontrolle des Stabbreithalters | \n", "44 | \n", "687 | \n", "[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... | \n", "166 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2681 | \n", "vom Eisenkernvorrichtung (Teil vom Kettenlauf ... | \n", "136 | \n", "1 | \n", "[515] | \n", "1 | \n", "
| 2680 | \n", "Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... | \n", "260 | \n", "1 | \n", "[311] | \n", "1 | \n", "
| 2679 | \n", "Zahnräder der Laufkatze verschlissen Ersatztei... | \n", "170 | \n", "1 | \n", "[415] | \n", "1 | \n", "
| 2677 | \n", "Schalter für Bühne Schwenken abgerissen, bitte... | \n", "126 | \n", "1 | \n", "[323] | \n", "1 | \n", "
| 2676 | \n", "Docke angefahren! | \n", "17 | \n", "1 | \n", "[176] | \n", "1 | \n", "
5090 rows × 5 columns
\n", "