In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell execution without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from lang_main import CONFIG
from lang_main.lib.preprocess import (
    load_raw_data,
    remove_duplicates,
    remove_NA,
    clean_string_slim,
    entry_wise_cleansing,
    analyse_feature,
    build_cosSim_matrix,
    filt_thresh_cosSim_matrix,
    list_cosSim_dupl_candidates,
    merge_similarity_dupl,
)
from lang_main.pipelines import BasePipeline, EmbeddingPipeline
from lang_main.lib.helpers import (
    save_pickle, 
    load_pickle, 
    create_saving_folder,
    load_toml_config,
)

from sentence_transformers import SentenceTransformer
import spacy
from pathlib import Path

ModuleNotFoundError: No module named 'ihm_analyse'

# Preprocessing

## Whole Dataset

In [13]:
# --- Configuration: input file, date columns and results folder ---

# Identifier of the export; it selects the raw CSV and names the results folder.
DATA_SET_ID = "Export4"
FILE_PATH = f"./01_2_Rohdaten_neu/{DATA_SET_ID}.csv"
path_raw_data = Path(FILE_PATH)

# Column names handed to `load_raw_data` via its `date_cols` argument.
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]

# Working directory for all pipelines in this notebook.
SAVE_PATH_FOLDER = f"./results/{DATA_SET_ID}/"
create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER)

In [14]:
# Preprocessing pipeline. The pipeline is created in this same cell, so
# re-running the cell starts from a fresh, empty pipeline.
pipe = BasePipeline(name="Preprocess1", working_dir=SAVE_PATH_FOLDER)

# Load the raw export, then remove duplicates and NA entries.
pipe.add(load_raw_data, dict(date_cols=date_cols))
pipe.add(remove_duplicates)
pipe.add(remove_NA, save_result=True)  # persisted as an intermediate result

# Clean each description with `clean_string_slim`, then analyse the feature.
pipe.add(
    entry_wise_cleansing,
    dict(target_feature="VorgangsBeschreibung", cleansing_func=clean_string_slim),
)
pipe.add(
    analyse_feature,
    dict(target_feature="VorgangsBeschreibung"),
    save_result=True,  # persisted as an intermediate result
)

In [15]:
# Display the pipeline's repr to check its name, working dir and registered steps.
pipe

BasePipeline(name: Preprocess1, working dir: ./results/Export4/, contents: ['load_raw_data', 'remove_duplicates', 'remove_NA', 'entry_wise_cleansing', 'analyse_feature'])

In [16]:
# Run the preprocessing pipeline; the raw-data path is the only starting value
# and is passed as a 1-tuple.
ret = pipe.run(starting_values=(path_raw_data,))

INFO:ihm_analyse.pipelines:Starting processing pipeline...
INFO:ihm_analyse.preprocess:Loaded dataset successfully.
INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 129020, number of features 20
INFO:ihm_analyse.preprocess:Number of duplicates over all features: 84
INFO:ihm_analyse.preprocess:Removed duplicates from dataset successfully.
INFO:ihm_analyse.preprocess:New Dataset properties: number of entries: 128936, number of features 20
INFO:ihm_analyse.preprocess:Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.
INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-3_remove_NA.pkl
INFO:ihm_analyse.preprocess:Successfully applied entry-wise cleansing procedure >>clean_string_slim<< for feature >>VorgangsBeschreibung<<
INFO:ihm_analyse.preprocess:Number of entries for feature >>VorgangsBeschreibung<<: 124008


100%|█████████████████████████████████████████████████████████████████████████████| 6800/6800 [00:37<00:00, 180.32it/s]

INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Preprocess1_Step-5_analyse_feature.pkl





INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.


In [17]:
# Inspect the first element of the pipeline result.
ret[0]

Unnamed: 0,entry,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,Wöchentliche Sichtkontrolle / Reinigung,39,1654,"[301, 304, 305, 313, 314, 331, 332, 510, 511, ...",18
131,Tägliche Überprüfung der Ölabscheider,37,1616,"[0, 970, 2134, 2137]",4
160,Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...
2679,Zahnräder der Laufkatze verschlissen Ersatztei...,170,1,[415],1
2678,Bitte 8 Scheiben nach Muster anfertigen. Danke.,48,1,[140],1
2677,"Schalter für Bühne Schwenken abgerissen, bitte...",126,1,[323],1
2676,Docke angefahren!,17,1,[176],1


In [171]:
# Reload intermediate results that the preprocessing pipeline persisted earlier.
ret = pipe.load_intermediate_result(
    saving_path=SAVE_PATH_FOLDER,
    filename="Pipe-Preprocess1_Step-3_remove_NA",
)
pre_1 = pipe.load_intermediate_result(
    saving_path=SAVE_PATH_FOLDER,
    filename="Pipe-Preprocess1_Step-5_analyse_feature",
)

# The first element of the stored `analyse_feature` result is the table that
# the following cells work with.
preprocessed_data = pre_1[0]

INFO:ihm_analyse.helpers:Loaded file successfully.
INFO:ihm_analyse.helpers:Loaded file successfully.


In [172]:
# Display the reloaded table to confirm it has the expected columns.
preprocessed_data

Unnamed: 0,entry,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,Wöchentliche Sichtkontrolle / Reinigung,39,1654,"[301, 304, 305, 313, 314, 331, 332, 510, 511, ...",18
131,Tägliche Überprüfung der Ölabscheider,37,1616,"[0, 970, 2134, 2137]",4
160,Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...
2679,Zahnräder der Laufkatze verschlissen Ersatztei...,170,1,[415],1
2678,Bitte 8 Scheiben nach Muster anfertigen. Danke.,48,1,[140],1
2677,"Schalter für Bühne Schwenken abgerissen, bitte...",126,1,[323],1
2676,Docke angefahren!,17,1,[176],1


## Embeddings

In [56]:
# Threshold for the cosine-similarity filter step, read from the project config.
SIMILARITY_THRESHOLD = CONFIG["preprocess"]["cosine_similarity_threshold"]
# File name handed to `list_cosSim_dupl_candidates` for the candidate export.
FILENAME_COSSIM_CANDFILT_WHOLE = "CosSim-FiltCand"

pipe_embds = BasePipeline(name="Embedding1", working_dir=SAVE_PATH_FOLDER)

# Language models: a spaCy pipeline and the sentence-transformer that is passed
# to `build_cosSim_matrix`.
# NOTE(review): the descriptions are German, while this sentence-transformers
# checkpoint is (as far as I know) trained mainly on English data -- confirm
# that it is suitable or consider a multilingual model.
nlp = spacy.load("de_dep_news_trf")
model_stfr = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


In [57]:
# Register the embedding steps; each one persists its result.
# NOTE(review): `pipe_embds` is created in a different cell, so re-running this
# cell alone presumably registers the steps a second time -- confirm.
pipe_embds.add(
    build_cosSim_matrix,
    dict(model=model_stfr),
    save_result=True,
)
pipe_embds.add(
    filt_thresh_cosSim_matrix,
    dict(threshold=SIMILARITY_THRESHOLD),
    save_result=True,
)
pipe_embds.add(
    list_cosSim_dupl_candidates,
    dict(
        save_candidates=True,
        saving_path=SAVE_PATH_FOLDER,
        filename=FILENAME_COSSIM_CANDFILT_WHOLE,
        pipeline=pipe_embds,
    ),
    save_result=True,
)

In [58]:
# Keep only descriptions whose `len` value exceeds 5, i.e. drop entries with
# fewer than six characters.
subset_data = preprocessed_data.loc[preprocessed_data["len"].gt(5), "entry"].copy()

# Start from the full subset; for a quick trial run, slice it first
# (e.g. `subset_data.iloc[:20]`).
start_val = subset_data.copy()
start_val

162     Tägliche Wartungstätigkeiten nach Vorgabe des ...
33                Wöchentliche Sichtkontrolle / Reinigung
131                 Tägliche Überprüfung der Ölabscheider
160                 Wöchentliche Kontrolle der WC-Anlagen
140          Halbjährliche Kontrolle des Stabbreithalters
                              ...                        
2679    Zahnräder der Laufkatze verschlissen Ersatztei...
2678     Bitte 8 Scheiben nach Muster anfertigen.  Danke.
2677    Schalter für Bühne Schwenken abgerissen, bitte...
2676                                    Docke angefahren!
6799    Befestigung Deckel für Batteriefach defekt    ...
Name: entry, Length: 6787, dtype: object

In [59]:
# Run the embedding pipeline on the selected descriptions and unpack its two
# return values. This is the slow cell: the recorded progress bars show roughly
# 6 minutes for the embeddings and 18 minutes for the similarity scores.
dupl_idx_pairs, embds = pipe_embds.run(starting_values=(start_val,))

INFO:ihm_analyse.pipelines:Starting processing pipeline...
INFO:ihm_analyse.preprocess:Start building embedding map...


100%|██████████████████████████████████████████████████████████████████████████████| 6787/6787 [06:08<00:00, 18.43it/s]

INFO:ihm_analyse.preprocess:Embedding map built successfully.
INFO:ihm_analyse.preprocess:Start calculation of similarity scores...



100%|███████████████████████████████████████████████████████████████████| 23028291/23028291 [18:00<00:00, 21305.85it/s]

INFO:ihm_analyse.preprocess:Similarity scores calculated successfully.





INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-1_build_cosSim_matrix.pkl
INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pkl
INFO:ihm_analyse.preprocess:Start gathering of similarity candidates...


100%|████████████████████████████████████████████████████████████████████████████| 9331/9331 [00:03<00:00, 2737.75it/s]

INFO:ihm_analyse.preprocess:Similarity candidates gathered successfully.
INFO:ihm_analyse.preprocess:Saving similarity candidates...





INFO:ihm_analyse.preprocess:Similarity candidates saved successfully to >>./results/Export4/PipeStep_3_CosSim-FiltCand.xlsx<<.
INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.


In [35]:
#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-1_build_cosSim_matrix')
#ret = pipe_embds.load_intermediate_result(saving_path=SAVE_PATH_FOLDER, filename='Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix')

INFO:ihm_analyse.helpers:Loaded file successfully.
INFO:ihm_analyse.helpers:Loaded file successfully.


In [175]:
# Number of duplicate-candidate index pairs.
len(dupl_idx_pairs)

9331

In [166]:
# Location of the pickled duplicate-candidate index pairs inside the results folder.
path_to_idx_pairs = f"{SAVE_PATH_FOLDER}dupl_idx_pairs.pkl"
path_to_idx_pairs

'./results/Export4/dupl_idx_pairs.pkl'

In [174]:
# Load-or-save guard for the duplicate-candidate index pairs.
# Previously the save was commented out and the load ran unconditionally, so
# the cell only worked if the pickle had been written by hand in an earlier
# session. Now: reuse the pickle when it exists, otherwise persist the pairs
# computed by the embedding pipeline in this session.
# NOTE(review): assumes `load_intermediate_result` resolves
# `saving_path` + `filename` + '.pkl', i.e. the same file as
# `path_to_idx_pairs` -- confirm against the helper.
# NOTE(review): an existing pickle still takes precedence over freshly computed
# pairs (same as before); delete the file to force a refresh.
if Path(path_to_idx_pairs).exists():
    dupl_idx_pairs = pipe_embds.load_intermediate_result(
        saving_path=SAVE_PATH_FOLDER,
        filename='dupl_idx_pairs',
    )
else:
    save_pickle(obj=dupl_idx_pairs, path=path_to_idx_pairs)

INFO:ihm_analyse.helpers:Loaded file successfully.


## Merge Duplicate Candidates

In [176]:
# Hand the merge step its own copy of the preprocessed table.
prep_data = preprocessed_data.copy()

# Separate pipeline for merging the duplicate candidates.
pipe_3 = BasePipeline(name="Merge_Duplicates", working_dir=SAVE_PATH_FOLDER)

In [177]:
# Single step: merge the similarity-based duplicate candidates and persist the result.
pipe_3.add(merge_similarity_dupl, save_result=True)

In [178]:
# Merge the duplicate candidates.
# Starting values: the preprocessed table and the index pairs of the
# duplicate candidates, in that order.
ret = pipe_3.run(starting_values=(prep_data, dupl_idx_pairs))

INFO:ihm_analyse.pipelines:Starting processing pipeline...
INFO:ihm_analyse.preprocess:Start merging of similarity candidates...


100%|███████████████████████████████████████████████████████████████████████████| 9331/9331 [00:00<00:00, 10511.31it/s]

INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.
INFO:ihm_analyse.helpers:Saved file successfully under ./results/Export4/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl





INFO:ihm_analyse.pipelines:Processing pipeline successfully ended.


In [75]:
# Display the merge result.
# NOTE(review): `ret` is reused for several different results in this notebook
# (preprocessing run, intermediate load, merge run); a stage-specific name
# would make the lineage clearer.
ret

Unnamed: 0,entry,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,Wöchentliche Sichtkontrolle / Reinigung,39,2163,"[301, 304, 305, 313, 314, 323, 329, 331, 332, ...",27
131,Tägliche Überprüfung der Ölabscheider,37,1619,"[0, 970, 2134, 2137]",4
160,Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...
2681,vom Eisenkernvorrichtung (Teil vom Kettenlauf ...,136,1,[515],1
2680,Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...,260,1,[311],1
2679,Zahnräder der Laufkatze verschlissen Ersatztei...,170,1,[415],1
2677,"Schalter für Bühne Schwenken abgerissen, bitte...",126,1,[323],1


# End Preprocessing

---