# **Analyse 2-3**

## Weiterführung Duplikatfindung mit Sentence-Transformer

## Analyse

In [9]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import spacy
import sentence_transformers
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from collections import Counter
from itertools import combinations
from dateutil.parser import parse
import re

import logging
import sys
import pickle


from ihm_analyze.helpers import (
    save_pickle,
    load_pickle,
    build_embedding_map,
    build_cosSim_matrix,
    filt_thresh_cosSim_matrix,
    list_cosSim_dupl_candidates,
    choose_cosSim_dupl_candidates,
)

LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('base')

In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
LOAD_CALC_FILES = False

DESC_BLACKLIST = set(['-'])
"""
GENERAL_BLACKLIST = set([
    'herr', 'hr.', 'förster', 'graf', 'stöppel', 
    'stab', 'kw', 'h.', 'koch', 'heininger', '.',
    'schwab', 'm.', 'wenninger', '-', '--',
])
"""

GENERAL_BLACKLIST = set([
    'herr', 'hr.' 'kw', 'h.', '.',
    'm.', '-', '--', 'dr.', 'dr',
])

#GENERAL_BLACKLIST = set()
#POS_of_interest = set(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
POS_of_interest = set(['NOUN', 'ADJ', 'VERB', 'AUX'])
TAG_of_interest = set(['ADJD'])

In [12]:
# load language model
# transformer model without vector embeddings
# can not be used to calculate similarities
# using sentence transformers instead
nlp = spacy.load('de_dep_news_trf')
#nlp = spacy.load('de_core_news_lg')

In [13]:
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


In [17]:
# load dataframe from duplicate detection
FILE_PATH_TEMP1 = './02_1_Preprocess1/01_DF_num_occur_temp1.parquet'
FILE_PATH_TEMP2 = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'
temp1 = pd.read_parquet(FILE_PATH_TEMP1)
temp2 = pd.read_pickle(FILE_PATH_TEMP2)

In [15]:
temp1

Unnamed: 0,descr,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,Wöchentliche Sichtkontrolle / Reinigung,39,1654,"[301, 304, 305, 313, 314, 331, 332, 510, 511, ...",18
131,Tägliche Überprüfung der Ölabscheider,37,1616,"[0, 970, 2134, 2137]",4
160,Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...
2679,Zahnräder der Laufkatze verschlissen Ersatztei...,170,1,[415],1
2678,Bitte 8 Scheiben nach Muster anfertigen. Danke.,48,1,[140],1
2677,"Schalter für Bühne Schwenken abgerissen, bitte...",126,1,[323],1
2676,Docke angefahren!,17,1,[176],1


In [18]:
temp2

Unnamed: 0,descr,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,Wöchentliche Sichtkontrolle / Reinigung,39,2163,"[301, 304, 305, 313, 314, 323, 329, 331, 332, ...",27
131,Tägliche Überprüfung der Ölabscheider,37,1619,"[0, 970, 2134, 2137]",4
160,Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...
2681,vom Eisenkernvorrichtung (Teil vom Kettenlauf ...,136,1,[515],1
2680,Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...,260,1,[311],1
2679,Zahnräder der Laufkatze verschlissen Ersatztei...,170,1,[415],1
2677,"Schalter für Bühne Schwenken abgerissen, bitte...",126,1,[323],1


# data for model training
data = temp1.iloc[50:300,0].to_list()
data = [e for e in data if e != '']

with open('spacy_train/training_data_2.txt','w', encoding='utf-8') as f:
    f.writelines("\n".join(data))

---

*Load Adjacency Matrix*
- built in ``Analyse_4-1``

In [19]:
SAVE_PATH_ADJ_DF = './02_1_Preprocess1/04_2_adj_mat_df.parquet'
SAVE_PATH_ADJ_DF_UNDIR = './02_1_Preprocess1/04_2_adj_mat_df_undir.parquet'

adj_mat = pd.read_parquet(SAVE_PATH_ADJ_DF)
adj_mat_undir = pd.read_parquet(SAVE_PATH_ADJ_DF_UNDIR)

In [20]:
adj_mat_undir

Unnamed: 0,Motordrehzahl,frieren,Klimaschächte,Massname,CampenAufwickler,Hängekästchen,Schutzbügel,muss,Endlagensensor,Kameralinse,...,Büroraum,Warten,Fahrens,Handregler,PM,Minute,Auffangkorb,Deaktivierung,Fachböden,Angebot
Motordrehzahl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
frieren,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Klimaschächte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Massname,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CampenAufwickler,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Minute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Auffangkorb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
Deaktivierung,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fachböden,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
adj_mat_idx_lst = adj_mat_undir.index.to_list()

In [23]:
len(adj_mat_idx_lst)

6468

Find similar words to group them together

In [82]:
# test word embeddings to find similarities (e.g. Prüfung, prüfen, Überprüfung)
batch = [
    'Prüfung',
    'Anlage',
    'Überprüfung der Maschine',
    'Überprüfung',
    'prüfen',
    'Herr',
    'Datum',
]

In [83]:
#batch = adj_mat_idx_lst.copy()

In [84]:
embds_words = model_stfr.encode(batch, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:00<00:00, 11.76it/s]


In [85]:
assert len(embds_words) == len(adj_mat_idx_lst)

AssertionError: 

In [86]:
ret = sentence_transformers.util.cos_sim(embds_words, embds_words)

In [87]:
data = ret.numpy().copy()
np.fill_diagonal(data, 0)
data = np.triu(data)

In [88]:
cosSim_words_df = pd.DataFrame(data=data, index=range(len(batch)), columns=range(len(batch)))

In [89]:
cosSim_words_df

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.301206,0.37493,0.616439,0.840472,0.291861,0.156846
1,0.0,0.0,0.167375,0.269911,0.260174,0.144282,0.124062
2,0.0,0.0,0.0,0.610566,0.292862,0.193036,0.12131
3,0.0,0.0,0.0,0.0,0.476879,0.238001,0.139318
4,0.0,0.0,0.0,0.0,0.0,0.30144,0.153496
5,0.0,0.0,0.0,0.0,0.0,0.0,0.184479
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
COSSIM_WORDS_THRESHOLD = 0.4
arr = adj_mat_undir.to_numpy()
arr = np.where(arr < WEIGHT_THRESHOLD, 0, arr)

In [31]:
# build mapping
word_mapping = dict()

for idx, entry in enumerate(batch):
    word_mapping[idx] = entry

In [54]:
COSSIM_WORD_THRESH = 0.9
ret_thresh = filt_thresh_cosSim_matrix(cosineSim_idx_matrix=cosSim_words_df, threshold=COSSIM_WORD_THRESH)

In [55]:
ret_thresh

2     4505    0.961571
4     2392    0.952447
15    6057    0.948648
21    3218    0.942368
38    6171    1.000000
                ...   
5858  6184    1.000000
5931  6053    1.000000
6056  6134    0.926162
6328  6425    1.000000
6350  6446    1.000000
Length: 618, dtype: float32

In [63]:
word_mapping[6056]

'Deckenplatte'

In [62]:
word_mapping[6134]

'Deckplatte'

Threshold

In [161]:
WEIGHT_THRESHOLD = 5
arr = adj_mat_undir.to_numpy()
arr = np.where(arr < WEIGHT_THRESHOLD, 0, arr)

In [162]:
np.count_nonzero(arr)

2916

In [163]:
temp = np.sum(arr, axis=0)
np.count_nonzero(temp)

903

In [164]:
thresh_adj_mat = adj_mat_undir.copy()
thresh_adj_mat.loc[:] = arr

In [165]:
thresh_adj_mat

Unnamed: 0,Motordrehzahl,frieren,Klimaschächte,Massname,CampenAufwickler,Hängekästchen,Schutzbügel,muss,Endlagensensor,Kameralinse,...,Büroraum,Warten,Fahrens,Handregler,PM,Minute,Auffangkorb,Deaktivierung,Fachböden,Angebot
Motordrehzahl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
frieren,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Klimaschächte,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Massname,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CampenAufwickler,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Minute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Auffangkorb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Deaktivierung,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fachböden,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
ADJ_MAT_PATH_CSV = f'./02_2_Preprocess2/20240306_adj_mat_thresh_mapping_{WEIGHT_THRESHOLD}.csv'
thresh_adj_mat.to_csv(path_or_buf=ADJ_MAT_PATH_CSV, encoding='cp1252', sep=';')

---

# BERTopic

In [34]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
 
#docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
#model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


- docs: list of texts to analyse

If you want to use your own embeddings, use it as follows:

        ``python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer

        # Create embeddings
        docs = fetch_20newsgroups(subset='all')['data']
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = sentence_model.encode(docs, show_progress_bar=True)

        # Create topic model
        topic_model = BERTopic()
        topics, probs = topic_model.fit_transform(docs, embeddings)``

In [105]:
# transform all descriptions as a collection to list
descriptions = temp1['descr'].to_list()
description_batch = descriptions[:10]
description_batch = descriptions.copy()

In [106]:
len(description_batch)

6790

In [111]:
assert len(descriptions_w_repetition) == num_occur_total
assert len(descriptions_wo_stopwords_repetition) == num_occur_total

In [127]:
LOAD_CALC_FILES = True
LOAD_CALC_REP_FILES = True
SAVING_CALC_FILES = False
SAVING_CALC_REP_FILES = False

In [128]:
# eliminate stop words from entries
if not LOAD_CALC_FILES:
    descriptions_wo_stopwords = list()

    for text in description_batch:
        doc = nlp(text)
        ret = [token.text for token in doc if not token.is_stop]
        concat = ' '.join(ret)
        
        descriptions_wo_stopwords.append(concat)

In [129]:
# calculate embeddings
#embds = model_stfr.encode(description_batch, show_progress_bar=True)

# repetition dataset too large, model on CPU using approx. 4 hours
#embds_rep = model_stfr.encode(descriptions_w_repetition, show_progress_bar=True)

In [130]:
# save
SAVE_PATH_EMBEDDINGS = './TopicModelling/embds.npy'
SAVE_PATH_EMBEDDINGS_REP = './TopicModelling/embds_rep.npy'
SAVE_PATH_WO_STOPWORDS = './TopicModelling/descr_wo_stopwords.pkl'
SAVE_PATH_WO_STOPWORDS_REP = './TopicModelling/descr_wo_stopwords_rep.pkl'
SAVE_PATH_WHOLE_REP = './TopicModelling/descr_whole_rep.pkl'
if SAVING_CALC_FILES:
    np.save(SAVE_PATH_EMBEDDINGS, embds)
    save_pickle(obj=descriptions_wo_stopwords, path=SAVE_PATH_WO_STOPWORDS)
if SAVING_CALC_REP_FILES:
    #np.save(SAVE_PATH_EMBEDDINGS_REP, embds_rep)
    save_pickle(obj=descriptions_wo_stopwords_repetition, path=SAVE_PATH_WO_STOPWORDS_REP)
    save_pickle(obj=descriptions_w_repetition, path=SAVE_PATH_WHOLE_REP)

In [131]:
# load
if LOAD_CALC_FILES:
    print('loading...')
    embds = np.load(SAVE_PATH_EMBEDDINGS)
    #embds_rep = np.load(SAVE_PATH_EMBEDDINGS_REP)
    descriptions_wo_stopwords = load_pickle(path=SAVE_PATH_WO_STOPWORDS)
    print('loaded')
if LOAD_CALC_REP_FILES:
    print('loading...')
    descriptions_wo_stopwords_repetition = load_pickle(path=SAVE_PATH_WO_STOPWORDS_REP)
    descriptions_w_repetition = load_pickle(path=SAVE_PATH_WHOLE_REP)
    print('loaded')

loading...
loaded
loading...
loaded


In [70]:
assert len(descriptions_wo_stopwords) == len(description_batch)
assert len(embds) == len(description_batch)
assert len(embds) == len(descriptions_wo_stopwords)

In [126]:
len(embds)

6790

In [None]:
# load duplicate cleaned dataset
SAVE_PATH_REMOVED_DUPL = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'

temp2 = 

In [157]:
ADJ_DF_PATH = './Graphanalyse/adj_mat_df.fth'
adj_mat_undir = pd.read_feather(ADJ_DF_PATH)
adj_mat_undir = adj_mat_undir.set_index('index')

In [158]:
adj_mat_undir

Unnamed: 0_level_0,Verunreinigung,Luftreiniger,bedeckt,Schweikopf,Frostprävention,Mithilfe,Interne,Reinigung,Prüfen,Defekte,...,Visuelle,Rundgang,Rieme,sein,Eigenverantwortlichkeit,Lager,Leckage,werden,Wartungsplan,Monat
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Verunreinigung,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Luftreiniger,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bedeckt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Schweikopf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Frostprävention,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lager,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Leckage,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
werden,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Wartungsplan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---

*Repetition analysis*

In [None]:
temp1['num_occur'].sum()

124008

In [None]:
temp2 = temp1[['descr', 'num_occur']]
#temp2 = temp2.iloc[:10,:]
num_occur_total = temp2['num_occur'].sum()
num_occur_total

124008

In [None]:
# reconstruct dataset with number of occurences for each entry

 if not LOAD_CALC_REP_FILES:
    descriptions_w_repetition = list()
    descriptions_wo_stopwords_repetition = list()

    for idx, entry in enumerate(temp2.itertuples()):
        num_occur = entry.num_occur
        descr_whole = entry.descr
        descr_wo_stopwords = descriptions_wo_stopwords[idx]
        
        descr_whole_rep = [descr_whole] * num_occur
        descr_wo_stopwords_rep = [descr_wo_stopwords] * num_occur
        
        descriptions_w_repetition.extend(descr_whole_rep)
        descriptions_wo_stopwords_repetition.extend(descr_wo_stopwords_rep)

---

In [71]:
topic_model = BERTopic()
#topics, probs = topic_model.fit_transform(description_batch, embds)
topics, probs = topic_model.fit_transform(descriptions_wo_stopwords, embds)

In [72]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1244,-1_bitte_danke_prfen_strung,"[bitte, danke, prfen, strung, herr, defekt, be...",[- Reinigen Gerätes Außen feuchten Reinigungst...
1,0,332,0_docke_dockenwickler_belag_berziehen,"[docke, dockenwickler, belag, berziehen, docke...","[docke, Docke Belag überziehen, docke überzieh..."
2,1,164,1_motor_vortrockner_hauptmotor_servomotor,"[motor, vortrockner, hauptmotor, servomotor, s...","[Vortrockner 1 Motor defekt ., Motor Geräusche..."
3,2,156,2_lager_umlenkwalze_tauschen_umwlzpumpe,"[lager, umlenkwalze, tauschen, umwlzpumpe, kit...","[Lager defekt ., Lager Defekt, Lager defekt ! ..."
4,3,155,3_kbk_stecker_defekt_steigdocke,"[kbk, stecker, defekt, steigdocke, kupplung, b...","[Kabel Stecker defekt, Kabel Stecker defekt, K..."
...,...,...,...,...,...
142,141,11,141_luft_messwalze_mluft_sauglippe,"[luft, messwalze, mluft, sauglippe, reinschaue...","[Sauglippe bewegt, M. läuft . Linke Bedienseit..."
143,142,11,142_paste_gewnschten_anschlagmittel_effekt,"[paste, gewnschten, anschlagmittel, effekt, fh...",[40Stück Gewindebolzen Keramikbremsen anfertig...
144,143,11,143_frostprvention_wrmetauscher_warmwasserhahn...,"[frostprvention, wrmetauscher, warmwasserhahn,...",[Wärmeofen ( Funktion Line ) Hebel öffnen Ofen...
145,144,11,144_auffllen_aschenbecher_desifektionsmittel_l...,"[auffllen, aschenbecher, desifektionsmittel, l...",[Täglicher Rundgang . ( Desifektionsmittel auf...


**Problem:**
- Modell nutzt klasische Stoppwörter mit und verfälscht Ergebnis
- BERTopic-Vorschlag: Nutzung einer Stopwortliste im Tokenizer-Modul der Pipeline
- gewählter Ansatz: Entfernung bereits nach Generierung der Embeddings

- Verfälschung durch Nutzung von zusammengeführtem Datensatz (``num_occur``), fließt nicht mit ein
- Alternative: Rekonstruktion Datensatz mit Anzahl Einträgen --> riesiger Rechenaufwand
    - CPU: Rechenzeit ungefährt 4 Stunden für Embeddings

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [10]:
docs[0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [None]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [32]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,27,-1_reinigung_und_der_von,"[reinigung, und, der, von, berprfung, sichtkon...",[3-Monatliche Reinigung und Prüfung der Kühlge...
1,0,25,0_kontrolle_der_auf_prfen,"[kontrolle, der, auf, prfen, wchentliche, kont...",[Wöchentliche Kontrolle Klimagerät Inneneinhe...
2,1,18,1_siehe_wartungsplan_vorgabe_extradaten,"[siehe, wartungsplan, vorgabe, extradaten, fir...",[Vorgabe aus Wartungsplan Firma Menzel (siehe ...


Test Cosine Similarity
- erstelle Matrix mit Ähnlichkeits-Score (obere Dreiecksmatrix)
- jedes Wortpaar
- filtere Tabelle nach Threshold
- nutze Gewichts-Adjezenzmatrix mit Threshold als Maske
    - nur Analyse von hochgewichtigen Gruppen
- analysiere Zusammenhänge in Form von Graph (ähnlich bisherigem Vorgehen)
- bilde Gruppen und benenne diese (z.B. Prüfung+Überprüfung+Kontrolle --> Überprüfung)
- baue daraus Wörterbuch und matche Begriffe bei der Erstellung

In [49]:
def build_cosine_similarity_matrix(
    adj_mat
):
    # obtain words to compare
    words = adj_mat.index.to_list()
    
    # cos matrix
    cos_mat = pd.DataFrame(
        data=0., 
        columns=words, 
        index=words,
        dtype=np.float32,
    )
    
    for (word1, word2) in combinations(words, 2):
        # obtain model vocabulary
        w1 = nlp.vocab[str(word1)]
        w2 = nlp.vocab[str(word2)]
        # calculate cosine similarity
        cos_sim = w1.similarity(w2)
        # set value
        cos_mat.at[word1, word2] = cos_sim
        
    return cos_mat

In [50]:
cos_mat = build_cosine_similarity_matrix(adj_mat=adj_mat_undir)

  cos_sim = w1.similarity(w2)


In [52]:
cos_mat

Unnamed: 0,Klübertemp,Schusssuche,Laser,Schaftteile,Dichtsätz,Tastatur,Vorspuleinheit,beginnen,auslesen,Kettspannung,...,Tänzerwalze,Abfallkante,rappeln,Rottenegger,Contrawalze,Eisenträger,Hängegurte,Treffen,Greiferarmen,Nadelleist
Klübertemp,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Schusssuche,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Laser,0.0,0.0,0.0,0.0,0.0,0.324276,0.0,0.059743,0.133676,0.0,...,0.0,0.0,-0.063913,0.0,0.0,0.167521,0.0,-0.029860,0.0,0.0
Schaftteile,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Dichtsätz,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Eisenträger,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.170954,0.0,0.0
Hängegurte,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Treffen,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Greiferarmen,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0


In [635]:
WEIGHT_THRESHOLD = 10
arr = adj_mat_undir.to_numpy()
COS_THRESHOLD = 0.4
cos_arr = cos_mat.to_numpy()

In [636]:
cos_arr_filt = np.where((cos_arr > COS_THRESHOLD) & (arr >= WEIGHT_THRESHOLD), cos_arr, 0)

In [637]:
cos_arr_filt

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [638]:
np.count_nonzero(cos_arr_filt)

217

In [639]:
thresh_cos_mat = cos_mat.copy()
thresh_cos_mat[:] = cos_arr_filt

In [640]:
thresh_cos_mat

Unnamed: 0,Verstärkung,Zuluftfilter,klemmt,Komminikation,Doppelholztische,Deckenbeleuchtung,Abfalltransport,fahrbar,Folieneinlauf,entsorgen,...,neuwertig,Bleit,Rauchentwicklung,Kompressorsteuerung,anziehen,Mitarbeiterin,Nägel,WZ,ExSchutzAnlage,Gemisch
Verstärkung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zuluftfilter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
klemmt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Komminikation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doppelholztische,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mitarbeiterin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Nägel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
WZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ExSchutzAnlage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [641]:
COS_MAT_PATH_CSV = f'./Graphanalyse_Gruppen/cos_mat_Wthresh_{WEIGHT_THRESHOLD}_Cthresh{int(COS_THRESHOLD*100)}.csv'
thresh_cos_mat.to_csv(path_or_buf=COS_MAT_PATH_CSV, encoding='cp1252', sep=';')