lang-main/notebooks/archive/Analyse_4-2.ipynb
2024-08-07 20:06:06 +02:00

3899 lines
127 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Analyse 4-2**\n",
"\n",
"## Weiterführung Duplikatfindung mit Sentence-Transformer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analyse"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"import spacy\n",
"import sentence_transformers\n",
"from sentence_transformers import SentenceTransformer\n",
"from spacy.lang.de import German as GermanSpacyModel\n",
"from collections import Counter\n",
"from itertools import combinations\n",
"from dateutil.parser import parse\n",
"import re\n",
"\n",
"import logging\n",
"import sys\n",
"import pickle\n",
"\n",
"\n",
"from ihm_analyze.helpers import (\n",
" save_pickle,\n",
" load_pickle,\n",
" build_embedding_map,\n",
" build_cosSim_matrix,\n",
" filt_thresh_cosSim_matrix,\n",
" list_cosSim_dupl_candidates,\n",
" choose_cosSim_dupl_candidates,\n",
")\n",
"\n",
"LOGGING_LEVEL = 'INFO'\n",
"logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)\n",
"logger = logging.getLogger('base')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Presumably toggles loading of previously calculated files; it is not read in\n",
"# the cells visible here -- TODO confirm.\n",
"LOAD_CALC_FILES = False\n",
"\n",
"# Blacklist for description entries (currently only a lone dash).\n",
"DESC_BLACKLIST = set(['-'])\n",
"\n",
"# Earlier, more extensive variant of GENERAL_BLACKLIST. It is kept for reference\n",
"# as an inert string literal: nothing is assigned or evaluated here.\n",
"\"\"\"\n",
"GENERAL_BLACKLIST = set([\n",
" 'herr', 'hr.', 'förster', 'graf', 'stöppel', \n",
" 'stab', 'kw', 'h.', 'koch', 'heininger', '.',\n",
" 'schwab', 'm.', 'wenninger', '-', '--',\n",
"])\n",
"\"\"\"\n",
"\n",
"# Active blacklist of lower-case tokens (salutations, abbreviations, punctuation).\n",
"# Every entry needs its own separating comma: adjacent string literals without\n",
"# one are silently concatenated by Python ('hr.' 'kw' would become 'hr.kw').\n",
"GENERAL_BLACKLIST = set([\n",
"    'herr', 'hr.', 'kw', 'h.', '.',\n",
"    'm.', '-', '--', 'dr.', 'dr',\n",
"])\n",
"\n",
"#GENERAL_BLACKLIST = set()\n",
"#POS_of_interest = set(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])\n",
"# Part-of-speech labels and fine-grained tags to keep -- presumably matched\n",
"# against spaCy token attributes in later cells (TODO confirm).\n",
"POS_of_interest = set(['NOUN', 'ADJ', 'VERB', 'AUX'])\n",
"TAG_of_interest = set(['ADJD'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# load language model\n",
"# transformer model without vector embeddings\n",
"# can not be used to calculate similarities\n",
"# using sentence transformers instead\n",
"nlp = spacy.load('de_dep_news_trf')\n",
"#nlp = spacy.load('de_core_news_lg')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
]
}
],
"source": [
"model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# load dataframe from duplicate detection\n",
"FILE_PATH_TEMP1 = './02_1_Preprocess1/01_DF_num_occur_temp1.parquet'\n",
"FILE_PATH_TEMP2 = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'\n",
"temp1 = pd.read_parquet(FILE_PATH_TEMP1)\n",
"temp2 = pd.read_pickle(FILE_PATH_TEMP2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>descr</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>1654</td>\n",
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1616</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2678</th>\n",
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
" <td>48</td>\n",
" <td>1</td>\n",
" <td>[140]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6799</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>107</td>\n",
" <td>1</td>\n",
" <td>[326]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6800 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" descr len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2679 [415] 1 \n",
"2678 [140] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"6799 [326] 1 \n",
"\n",
"[6800 rows x 5 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp1"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>descr</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>2163</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
" <td>37</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2681</th>\n",
" <td>vom Eisenkernvorrichtung (Teil vom Kettenlauf ...</td>\n",
" <td>136</td>\n",
" <td>1</td>\n",
" <td>[515]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2680</th>\n",
" <td>Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...</td>\n",
" <td>260</td>\n",
" <td>1</td>\n",
" <td>[311]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2679</th>\n",
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
" <td>170</td>\n",
" <td>1</td>\n",
" <td>[415]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2677</th>\n",
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
" <td>126</td>\n",
" <td>1</td>\n",
" <td>[323]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2676</th>\n",
" <td>Docke angefahren!</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>[176]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5090 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" descr len num_occur \\\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n",
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
"... ... ... ... \n",
"2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n",
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
"2676 Docke angefahren! 17 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n",
"131 [0, 970, 2134, 2137] 4 \n",
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
"... ... ... \n",
"2681 [515] 1 \n",
"2680 [311] 1 \n",
"2679 [415] 1 \n",
"2677 [323] 1 \n",
"2676 [176] 1 \n",
"\n",
"[5090 rows x 5 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Disabled code, kept for reference (export of training data):\n",
"\n",
"```python\n",
"# data for model training\n",
"data = temp1.iloc[50:300,0].to_list()\n",
"data = [e for e in data if e != '']\n",
"\n",
"with open('spacy_train/training_data_2.txt','w', encoding='utf-8') as f:\n",
"    f.writelines(\"\\n\".join(data))\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"*Load Adjacency Matrix*\n",
"- built in ``Analyse_4-1``"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"SAVE_PATH_ADJ_DF = './02_1_Preprocess1/04_2_adj_mat_df.parquet'\n",
"SAVE_PATH_ADJ_DF_UNDIR = './02_1_Preprocess1/04_2_adj_mat_df_undir.parquet'\n",
"\n",
"adj_mat = pd.read_parquet(SAVE_PATH_ADJ_DF)\n",
"adj_mat_undir = pd.read_parquet(SAVE_PATH_ADJ_DF_UNDIR)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Motordrehzahl</th>\n",
" <th>frieren</th>\n",
" <th>Klimaschächte</th>\n",
" <th>Massname</th>\n",
" <th>CampenAufwickler</th>\n",
" <th>Hängekästchen</th>\n",
" <th>Schutzbügel</th>\n",
" <th>muss</th>\n",
" <th>Endlagensensor</th>\n",
" <th>Kameralinse</th>\n",
" <th>...</th>\n",
" <th>Büroraum</th>\n",
" <th>Warten</th>\n",
" <th>Fahrens</th>\n",
" <th>Handregler</th>\n",
" <th>PM</th>\n",
" <th>Minute</th>\n",
" <th>Auffangkorb</th>\n",
" <th>Deaktivierung</th>\n",
" <th>Fachböden</th>\n",
" <th>Angebot</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Motordrehzahl</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>frieren</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Klimaschächte</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Massname</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CampenAufwickler</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Minute</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Auffangkorb</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Deaktivierung</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fachböden</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angebot</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6468 rows × 6468 columns</p>\n",
"</div>"
],
"text/plain": [
" Motordrehzahl frieren Klimaschächte Massname \\\n",
"Motordrehzahl 0 0 0 0 \n",
"frieren 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 \n",
"Massname 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 \n",
"... ... ... ... ... \n",
"Minute 0 0 0 0 \n",
"Auffangkorb 0 0 0 0 \n",
"Deaktivierung 0 0 0 0 \n",
"Fachböden 0 0 0 0 \n",
"Angebot 0 0 0 0 \n",
"\n",
" CampenAufwickler Hängekästchen Schutzbügel muss \\\n",
"Motordrehzahl 0 0 0 0 \n",
"frieren 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 \n",
"Massname 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 \n",
"... ... ... ... ... \n",
"Minute 0 0 0 0 \n",
"Auffangkorb 0 0 0 0 \n",
"Deaktivierung 0 0 0 0 \n",
"Fachböden 0 0 0 0 \n",
"Angebot 0 0 0 0 \n",
"\n",
" Endlagensensor Kameralinse ... Büroraum Warten Fahrens \\\n",
"Motordrehzahl 0 0 ... 0 0 0 \n",
"frieren 0 0 ... 0 0 0 \n",
"Klimaschächte 0 0 ... 0 0 0 \n",
"Massname 0 0 ... 0 0 0 \n",
"CampenAufwickler 0 0 ... 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"Minute 0 0 ... 0 0 0 \n",
"Auffangkorb 0 0 ... 0 0 0 \n",
"Deaktivierung 0 0 ... 0 0 0 \n",
"Fachböden 0 0 ... 0 0 0 \n",
"Angebot 0 0 ... 0 0 0 \n",
"\n",
" Handregler PM Minute Auffangkorb Deaktivierung \\\n",
"Motordrehzahl 0 0 0 0 0 \n",
"frieren 0 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 0 \n",
"Massname 0 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 0 \n",
"... ... .. ... ... ... \n",
"Minute 0 0 0 0 0 \n",
"Auffangkorb 0 0 0 4 0 \n",
"Deaktivierung 0 0 0 0 0 \n",
"Fachböden 0 0 0 0 0 \n",
"Angebot 0 0 0 0 0 \n",
"\n",
" Fachböden Angebot \n",
"Motordrehzahl 0 0 \n",
"frieren 0 0 \n",
"Klimaschächte 0 0 \n",
"Massname 0 0 \n",
"CampenAufwickler 0 0 \n",
"... ... ... \n",
"Minute 0 0 \n",
"Auffangkorb 0 0 \n",
"Deaktivierung 0 0 \n",
"Fachböden 0 0 \n",
"Angebot 0 0 \n",
"\n",
"[6468 rows x 6468 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adj_mat_undir"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"adj_mat_idx_lst = adj_mat_undir.index.to_list()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6468"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(adj_mat_idx_lst)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Find similar words to group them together"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"# test word embeddings to find similarities (e.g. Prüfung, prüfen, Überprüfung)\n",
"batch = [\n",
" 'Prüfung',\n",
" 'Anlage',\n",
" 'Überprüfung der Maschine',\n",
" 'Überprüfung',\n",
" 'prüfen',\n",
" 'Herr',\n",
" 'Datum',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"#batch = adj_mat_idx_lst.copy()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Batches: 100%|██████████| 1/1 [00:00<00:00, 11.76it/s]\n"
]
}
],
"source": [
"embds_words = model_stfr.encode(batch, show_progress_bar=True)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: the encoder must return exactly one embedding per input entry.\n",
"# Comparing against `batch` (the list that was actually encoded) keeps the check\n",
"# valid for the small test batch as well as for a full copy of `adj_mat_idx_lst`.\n",
"assert len(embds_words) == len(batch), 'number of embeddings differs from number of encoded entries'"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"ret = sentence_transformers.util.cos_sim(embds_words, embds_words)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the strict upper triangle of the similarity matrix: the diagonal\n",
"# (self-similarity) and the mirrored lower half are set to zero, so each pair\n",
"# of entries appears exactly once. np.triu returns a new array, `ret` is untouched.\n",
"data = np.triu(ret.numpy(), k=1)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"cosSim_words_df = pd.DataFrame(data=data, index=range(len(batch)), columns=range(len(batch)))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>0.301206</td>\n",
" <td>0.374930</td>\n",
" <td>0.616439</td>\n",
" <td>0.840472</td>\n",
" <td>0.291861</td>\n",
" <td>0.156846</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.167375</td>\n",
" <td>0.269911</td>\n",
" <td>0.260174</td>\n",
" <td>0.144282</td>\n",
" <td>0.124062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.610566</td>\n",
" <td>0.292862</td>\n",
" <td>0.193036</td>\n",
" <td>0.121310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.476879</td>\n",
" <td>0.238001</td>\n",
" <td>0.139318</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.301440</td>\n",
" <td>0.153496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.184479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6\n",
"0 0.0 0.301206 0.374930 0.616439 0.840472 0.291861 0.156846\n",
"1 0.0 0.000000 0.167375 0.269911 0.260174 0.144282 0.124062\n",
"2 0.0 0.000000 0.000000 0.610566 0.292862 0.193036 0.121310\n",
"3 0.0 0.000000 0.000000 0.000000 0.476879 0.238001 0.139318\n",
"4 0.0 0.000000 0.000000 0.000000 0.000000 0.301440 0.153496\n",
"5 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.184479\n",
"6 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cosSim_words_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Zero out all word pairs whose cosine similarity lies below the threshold.\n",
"# The similarity matrix is filtered with its own constant here; the weights of\n",
"# `adj_mat_undir` are filtered separately with WEIGHT_THRESHOLD in the\n",
"# 'Threshold' section further down, where that constant is defined.\n",
"COSSIM_WORDS_THRESHOLD = 0.4\n",
"cosSim_words_arr = cosSim_words_df.to_numpy()\n",
"cosSim_words_thresh = np.where(cosSim_words_arr < COSSIM_WORDS_THRESHOLD, 0, cosSim_words_arr)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# build mapping\n",
"word_mapping = dict()\n",
"\n",
"for idx, entry in enumerate(batch):\n",
" word_mapping[idx] = entry"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"COSSIM_WORD_THRESH = 0.9\n",
"ret_thresh = filt_thresh_cosSim_matrix(cosineSim_idx_matrix=cosSim_words_df, threshold=COSSIM_WORD_THRESH)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 4505 0.961571\n",
"4 2392 0.952447\n",
"15 6057 0.948648\n",
"21 3218 0.942368\n",
"38 6171 1.000000\n",
" ... \n",
"5858 6184 1.000000\n",
"5931 6053 1.000000\n",
"6056 6134 0.926162\n",
"6328 6425 1.000000\n",
"6350 6446 1.000000\n",
"Length: 618, dtype: float32"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret_thresh"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Deckenplatte'"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_mapping[6056]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Deckplatte'"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_mapping[6134]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Threshold"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"WEIGHT_THRESHOLD = 5\n",
"arr = adj_mat_undir.to_numpy()\n",
"arr = np.where(arr < WEIGHT_THRESHOLD, 0, arr)"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2916"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.count_nonzero(arr)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"903"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp = np.sum(arr, axis=0)\n",
"np.count_nonzero(temp)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"thresh_adj_mat = adj_mat_undir.copy()\n",
"thresh_adj_mat.loc[:] = arr"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Motordrehzahl</th>\n",
" <th>frieren</th>\n",
" <th>Klimaschächte</th>\n",
" <th>Massname</th>\n",
" <th>CampenAufwickler</th>\n",
" <th>Hängekästchen</th>\n",
" <th>Schutzbügel</th>\n",
" <th>muss</th>\n",
" <th>Endlagensensor</th>\n",
" <th>Kameralinse</th>\n",
" <th>...</th>\n",
" <th>Büroraum</th>\n",
" <th>Warten</th>\n",
" <th>Fahrens</th>\n",
" <th>Handregler</th>\n",
" <th>PM</th>\n",
" <th>Minute</th>\n",
" <th>Auffangkorb</th>\n",
" <th>Deaktivierung</th>\n",
" <th>Fachböden</th>\n",
" <th>Angebot</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Motordrehzahl</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>frieren</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Klimaschächte</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Massname</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CampenAufwickler</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Minute</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Auffangkorb</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Deaktivierung</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fachböden</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angebot</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6469 rows × 6469 columns</p>\n",
"</div>"
],
"text/plain": [
" Motordrehzahl frieren Klimaschächte Massname \\\n",
"Motordrehzahl 0 0 0 0 \n",
"frieren 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 \n",
"Massname 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 \n",
"... ... ... ... ... \n",
"Minute 0 0 0 0 \n",
"Auffangkorb 0 0 0 0 \n",
"Deaktivierung 0 0 0 0 \n",
"Fachböden 0 0 0 0 \n",
"Angebot 0 0 0 0 \n",
"\n",
" CampenAufwickler Hängekästchen Schutzbügel muss \\\n",
"Motordrehzahl 0 0 0 0 \n",
"frieren 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 \n",
"Massname 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 \n",
"... ... ... ... ... \n",
"Minute 0 0 0 0 \n",
"Auffangkorb 0 0 0 0 \n",
"Deaktivierung 0 0 0 0 \n",
"Fachböden 0 0 0 0 \n",
"Angebot 0 0 0 0 \n",
"\n",
" Endlagensensor Kameralinse ... Büroraum Warten Fahrens \\\n",
"Motordrehzahl 0 0 ... 0 0 0 \n",
"frieren 0 0 ... 0 0 0 \n",
"Klimaschächte 0 0 ... 0 0 0 \n",
"Massname 0 0 ... 0 0 0 \n",
"CampenAufwickler 0 0 ... 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"Minute 0 0 ... 0 0 0 \n",
"Auffangkorb 0 0 ... 0 0 0 \n",
"Deaktivierung 0 0 ... 0 0 0 \n",
"Fachböden 0 0 ... 0 0 0 \n",
"Angebot 0 0 ... 0 0 0 \n",
"\n",
" Handregler PM Minute Auffangkorb Deaktivierung \\\n",
"Motordrehzahl 0 0 0 0 0 \n",
"frieren 0 0 0 0 0 \n",
"Klimaschächte 0 0 0 0 0 \n",
"Massname 0 0 0 0 0 \n",
"CampenAufwickler 0 0 0 0 0 \n",
"... ... .. ... ... ... \n",
"Minute 0 0 0 0 0 \n",
"Auffangkorb 0 0 0 0 0 \n",
"Deaktivierung 0 0 0 0 0 \n",
"Fachböden 0 0 0 0 0 \n",
"Angebot 0 0 0 0 0 \n",
"\n",
" Fachböden Angebot \n",
"Motordrehzahl 0 0 \n",
"frieren 0 0 \n",
"Klimaschächte 0 0 \n",
"Massname 0 0 \n",
"CampenAufwickler 0 0 \n",
"... ... ... \n",
"Minute 0 0 \n",
"Auffangkorb 0 0 \n",
"Deaktivierung 0 0 \n",
"Fachböden 0 0 \n",
"Angebot 0 0 \n",
"\n",
"[6469 rows x 6469 columns]"
]
},
"execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"thresh_adj_mat"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"ADJ_MAT_PATH_CSV = f'./02_2_Preprocess2/20240306_adj_mat_thresh_mapping_{WEIGHT_THRESHOLD}.csv'\n",
"thresh_adj_mat.to_csv(path_or_buf=ADJ_MAT_PATH_CSV, encoding='cp1252', sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"# BERTopic"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"from bertopic import BERTopic\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sentence_transformers import SentenceTransformer\n",
" \n",
"#docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']\n",
"#model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- docs: list of texts to analyse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to use your own (pre-computed) embeddings, pass them to `fit_transform` as follows:\n",
"\n",
"```python\n",
"from bertopic import BERTopic\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Create embeddings\n",
"docs = fetch_20newsgroups(subset='all')['data']\n",
"sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"embeddings = sentence_model.encode(docs, show_progress_bar=True)\n",
"\n",
"# Create topic model\n",
"topic_model = BERTopic()\n",
"topics, probs = topic_model.fit_transform(docs, embeddings)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# collect all descriptions in a plain list\n",
"descriptions = temp1['descr'].to_list()\n",
"# debugging aid: uncomment to work on a small sample instead of the full dataset\n",
"#description_batch = descriptions[:10]\n",
"description_batch = descriptions.copy()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6790"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(description_batch)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"assert len(descriptions_w_repetition) == num_occur_total\n",
"assert len(descriptions_wo_stopwords_repetition) == num_occur_total"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"LOAD_CALC_FILES = True\n",
"LOAD_CALC_REP_FILES = True\n",
"SAVING_CALC_FILES = False\n",
"SAVING_CALC_REP_FILES = False"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"# eliminate stop words from entries\n",
"if not LOAD_CALC_FILES:\n",
"    # nlp.pipe streams the texts through the pipeline in batches, which is faster than\n",
"    # calling nlp(text) once per entry; the docs are yielded in input order\n",
"    descriptions_wo_stopwords = [\n",
"        ' '.join(token.text for token in doc if not token.is_stop)\n",
"        for doc in nlp.pipe(description_batch)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"# calculate embeddings\n",
"#embds = model_stfr.encode(description_batch, show_progress_bar=True)\n",
"\n",
"# repetition dataset too large, model on CPU using approx. 4 hours\n",
"#embds_rep = model_stfr.encode(descriptions_w_repetition, show_progress_bar=True)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"# save\n",
"SAVE_PATH_EMBEDDINGS = './TopicModelling/embds.npy'\n",
"SAVE_PATH_EMBEDDINGS_REP = './TopicModelling/embds_rep.npy'\n",
"SAVE_PATH_WO_STOPWORDS = './TopicModelling/descr_wo_stopwords.pkl'\n",
"SAVE_PATH_WO_STOPWORDS_REP = './TopicModelling/descr_wo_stopwords_rep.pkl'\n",
"SAVE_PATH_WHOLE_REP = './TopicModelling/descr_whole_rep.pkl'\n",
"if SAVING_CALC_FILES:\n",
" np.save(SAVE_PATH_EMBEDDINGS, embds)\n",
" save_pickle(obj=descriptions_wo_stopwords, path=SAVE_PATH_WO_STOPWORDS)\n",
"if SAVING_CALC_REP_FILES:\n",
" #np.save(SAVE_PATH_EMBEDDINGS_REP, embds_rep)\n",
" save_pickle(obj=descriptions_wo_stopwords_repetition, path=SAVE_PATH_WO_STOPWORDS_REP)\n",
" save_pickle(obj=descriptions_w_repetition, path=SAVE_PATH_WHOLE_REP)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading...\n",
"loaded\n",
"loading...\n",
"loaded\n"
]
}
],
"source": [
"# load\n",
"if LOAD_CALC_FILES:\n",
" print('loading...')\n",
" embds = np.load(SAVE_PATH_EMBEDDINGS)\n",
" #embds_rep = np.load(SAVE_PATH_EMBEDDINGS_REP)\n",
" descriptions_wo_stopwords = load_pickle(path=SAVE_PATH_WO_STOPWORDS)\n",
" print('loaded')\n",
"if LOAD_CALC_REP_FILES:\n",
" print('loading...')\n",
" descriptions_wo_stopwords_repetition = load_pickle(path=SAVE_PATH_WO_STOPWORDS_REP)\n",
" descriptions_w_repetition = load_pickle(path=SAVE_PATH_WHOLE_REP)\n",
" print('loaded')"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"assert len(descriptions_wo_stopwords) == len(description_batch)\n",
"assert len(embds) == len(description_batch)\n",
"assert len(embds) == len(descriptions_wo_stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6790"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(embds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load duplicate cleaned dataset\n",
"SAVE_PATH_REMOVED_DUPL = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'\n",
"\n",
"# NOTE(review): the assignment was left unfinished (`temp2 = `), which is a SyntaxError;\n",
"# loading the pickle referenced above is the presumed intent - confirm\n",
"temp2 = load_pickle(path=SAVE_PATH_REMOVED_DUPL)"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"ADJ_DF_PATH = './Graphanalyse/adj_mat_df.fth'\n",
"adj_mat_undir = pd.read_feather(ADJ_DF_PATH)\n",
"adj_mat_undir = adj_mat_undir.set_index('index')"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Verunreinigung</th>\n",
" <th>Luftreiniger</th>\n",
" <th>bedeckt</th>\n",
" <th>Schweikopf</th>\n",
" <th>Frostprävention</th>\n",
" <th>Mithilfe</th>\n",
" <th>Interne</th>\n",
" <th>Reinigung</th>\n",
" <th>Prüfen</th>\n",
" <th>Defekte</th>\n",
" <th>...</th>\n",
" <th>Visuelle</th>\n",
" <th>Rundgang</th>\n",
" <th>Rieme</th>\n",
" <th>sein</th>\n",
" <th>Eigenverantwortlichkeit</th>\n",
" <th>Lager</th>\n",
" <th>Leckage</th>\n",
" <th>werden</th>\n",
" <th>Wartungsplan</th>\n",
" <th>Monat</th>\n",
" </tr>\n",
" <tr>\n",
" <th>index</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Verunreinigung</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Luftreiniger</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bedeckt</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Schweikopf</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Frostprävention</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Lager</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leckage</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>werden</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Wartungsplan</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Monat</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>165 rows × 165 columns</p>\n",
"</div>"
],
"text/plain": [
" Verunreinigung Luftreiniger bedeckt Schweikopf \\\n",
"index \n",
"Verunreinigung 0 0 0 0 \n",
"Luftreiniger 0 0 0 0 \n",
"bedeckt 0 0 0 0 \n",
"Schweikopf 0 0 0 0 \n",
"Frostprävention 0 0 0 0 \n",
"... ... ... ... ... \n",
"Lager 0 0 0 0 \n",
"Leckage 0 0 0 0 \n",
"werden 0 0 0 0 \n",
"Wartungsplan 0 0 0 0 \n",
"Monat 0 0 0 0 \n",
"\n",
" Frostprävention Mithilfe Interne Reinigung Prüfen \\\n",
"index \n",
"Verunreinigung 0 0 0 0 0 \n",
"Luftreiniger 0 0 0 0 0 \n",
"bedeckt 0 0 0 0 0 \n",
"Schweikopf 0 0 0 0 0 \n",
"Frostprävention 0 0 0 0 0 \n",
"... ... ... ... ... ... \n",
"Lager 0 0 0 0 0 \n",
"Leckage 0 0 0 0 0 \n",
"werden 0 0 0 0 0 \n",
"Wartungsplan 0 0 0 0 0 \n",
"Monat 0 0 0 0 0 \n",
"\n",
" Defekte ... Visuelle Rundgang Rieme sein \\\n",
"index ... \n",
"Verunreinigung 0 ... 0 0 0 0 \n",
"Luftreiniger 0 ... 0 0 0 0 \n",
"bedeckt 0 ... 0 0 0 0 \n",
"Schweikopf 0 ... 0 0 0 0 \n",
"Frostprävention 0 ... 0 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"Lager 0 ... 0 0 0 0 \n",
"Leckage 0 ... 0 0 0 0 \n",
"werden 0 ... 0 0 0 0 \n",
"Wartungsplan 0 ... 0 0 0 0 \n",
"Monat 0 ... 0 0 0 0 \n",
"\n",
" Eigenverantwortlichkeit Lager Leckage werden \\\n",
"index \n",
"Verunreinigung 0 0 0 0 \n",
"Luftreiniger 0 0 0 0 \n",
"bedeckt 0 0 0 0 \n",
"Schweikopf 0 0 0 0 \n",
"Frostprävention 0 0 0 0 \n",
"... ... ... ... ... \n",
"Lager 0 0 0 0 \n",
"Leckage 0 0 0 0 \n",
"werden 0 0 0 0 \n",
"Wartungsplan 0 0 0 0 \n",
"Monat 0 0 0 0 \n",
"\n",
" Wartungsplan Monat \n",
"index \n",
"Verunreinigung 0 0 \n",
"Luftreiniger 0 0 \n",
"bedeckt 0 0 \n",
"Schweikopf 0 0 \n",
"Frostprävention 0 0 \n",
"... ... ... \n",
"Lager 0 0 \n",
"Leckage 0 0 \n",
"werden 0 0 \n",
"Wartungsplan 0 0 \n",
"Monat 0 0 \n",
"\n",
"[165 rows x 165 columns]"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adj_mat_undir"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"*Repetition analysis*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"124008"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"temp1['num_occur'].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"124008"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"temp2 = temp1[['descr', 'num_occur']]\n",
"#temp2 = temp2.iloc[:10,:]\n",
"num_occur_total = temp2['num_occur'].sum()\n",
"num_occur_total"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# reconstruct dataset with number of occurrences for each entry\n",
"\n",
"if not LOAD_CALC_REP_FILES:\n",
"    # rows of temp2 and the stop-word-free descriptions are matched by position,\n",
"    # so both must have the same length\n",
"    assert len(descriptions_wo_stopwords) == len(temp2)\n",
"\n",
"    descriptions_w_repetition = list()\n",
"    descriptions_wo_stopwords_repetition = list()\n",
"\n",
"    for idx, entry in enumerate(temp2.itertuples()):\n",
"        num_occur = entry.num_occur\n",
"        descr_whole = entry.descr\n",
"        descr_wo_stopwords = descriptions_wo_stopwords[idx]\n",
"\n",
"        # repeat each description as often as it occurred in the original data\n",
"        descriptions_w_repetition.extend([descr_whole] * num_occur)\n",
"        descriptions_wo_stopwords_repetition.extend([descr_wo_stopwords] * num_occur)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"topic_model = BERTopic()\n",
"#topics, probs = topic_model.fit_transform(description_batch, embds)\n",
"topics, probs = topic_model.fit_transform(descriptions_wo_stopwords, embds)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Topic</th>\n",
" <th>Count</th>\n",
" <th>Name</th>\n",
" <th>Representation</th>\n",
" <th>Representative_Docs</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1</td>\n",
" <td>1244</td>\n",
" <td>-1_bitte_danke_prfen_strung</td>\n",
" <td>[bitte, danke, prfen, strung, herr, defekt, be...</td>\n",
" <td>[- Reinigen Gerätes Außen feuchten Reinigungst...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>332</td>\n",
" <td>0_docke_dockenwickler_belag_berziehen</td>\n",
" <td>[docke, dockenwickler, belag, berziehen, docke...</td>\n",
" <td>[docke, Docke Belag überziehen, docke überzieh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>164</td>\n",
" <td>1_motor_vortrockner_hauptmotor_servomotor</td>\n",
" <td>[motor, vortrockner, hauptmotor, servomotor, s...</td>\n",
" <td>[Vortrockner 1 Motor defekt ., Motor Geräusche...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>156</td>\n",
" <td>2_lager_umlenkwalze_tauschen_umwlzpumpe</td>\n",
" <td>[lager, umlenkwalze, tauschen, umwlzpumpe, kit...</td>\n",
" <td>[Lager defekt ., Lager Defekt, Lager defekt ! ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>155</td>\n",
" <td>3_kbk_stecker_defekt_steigdocke</td>\n",
" <td>[kbk, stecker, defekt, steigdocke, kupplung, b...</td>\n",
" <td>[Kabel Stecker defekt, Kabel Stecker defekt, K...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>141</td>\n",
" <td>11</td>\n",
" <td>141_luft_messwalze_mluft_sauglippe</td>\n",
" <td>[luft, messwalze, mluft, sauglippe, reinschaue...</td>\n",
" <td>[Sauglippe bewegt, M. läuft . Linke Bedienseit...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>142</td>\n",
" <td>11</td>\n",
" <td>142_paste_gewnschten_anschlagmittel_effekt</td>\n",
" <td>[paste, gewnschten, anschlagmittel, effekt, fh...</td>\n",
" <td>[40Stück Gewindebolzen Keramikbremsen anfertig...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>143</td>\n",
" <td>11</td>\n",
" <td>143_frostprvention_wrmetauscher_warmwasserhahn...</td>\n",
" <td>[frostprvention, wrmetauscher, warmwasserhahn,...</td>\n",
" <td>[Wärmeofen ( Funktion Line ) Hebel öffnen Ofen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>144</td>\n",
" <td>11</td>\n",
" <td>144_auffllen_aschenbecher_desifektionsmittel_l...</td>\n",
" <td>[auffllen, aschenbecher, desifektionsmittel, l...</td>\n",
" <td>[Täglicher Rundgang . ( Desifektionsmittel auf...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>145</td>\n",
" <td>10</td>\n",
" <td>145_pflasterschrank_mm_verbandsmaterial_cm</td>\n",
" <td>[pflasterschrank, mm, verbandsmaterial, cm, fi...</td>\n",
" <td>[Anfertigung Bestellung 6 Abstandseinstellplat...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>147 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Topic Count Name \\\n",
"0 -1 1244 -1_bitte_danke_prfen_strung \n",
"1 0 332 0_docke_dockenwickler_belag_berziehen \n",
"2 1 164 1_motor_vortrockner_hauptmotor_servomotor \n",
"3 2 156 2_lager_umlenkwalze_tauschen_umwlzpumpe \n",
"4 3 155 3_kbk_stecker_defekt_steigdocke \n",
".. ... ... ... \n",
"142 141 11 141_luft_messwalze_mluft_sauglippe \n",
"143 142 11 142_paste_gewnschten_anschlagmittel_effekt \n",
"144 143 11 143_frostprvention_wrmetauscher_warmwasserhahn... \n",
"145 144 11 144_auffllen_aschenbecher_desifektionsmittel_l... \n",
"146 145 10 145_pflasterschrank_mm_verbandsmaterial_cm \n",
"\n",
" Representation \\\n",
"0 [bitte, danke, prfen, strung, herr, defekt, be... \n",
"1 [docke, dockenwickler, belag, berziehen, docke... \n",
"2 [motor, vortrockner, hauptmotor, servomotor, s... \n",
"3 [lager, umlenkwalze, tauschen, umwlzpumpe, kit... \n",
"4 [kbk, stecker, defekt, steigdocke, kupplung, b... \n",
".. ... \n",
"142 [luft, messwalze, mluft, sauglippe, reinschaue... \n",
"143 [paste, gewnschten, anschlagmittel, effekt, fh... \n",
"144 [frostprvention, wrmetauscher, warmwasserhahn,... \n",
"145 [auffllen, aschenbecher, desifektionsmittel, l... \n",
"146 [pflasterschrank, mm, verbandsmaterial, cm, fi... \n",
"\n",
" Representative_Docs \n",
"0 [- Reinigen Gerätes Außen feuchten Reinigungst... \n",
"1 [docke, Docke Belag überziehen, docke überzieh... \n",
"2 [Vortrockner 1 Motor defekt ., Motor Geräusche... \n",
"3 [Lager defekt ., Lager Defekt, Lager defekt ! ... \n",
"4 [Kabel Stecker defekt, Kabel Stecker defekt, K... \n",
".. ... \n",
"142 [Sauglippe bewegt, M. läuft . Linke Bedienseit... \n",
"143 [40Stück Gewindebolzen Keramikbremsen anfertig... \n",
"144 [Wärmeofen ( Funktion Line ) Hebel öffnen Ofen... \n",
"145 [Täglicher Rundgang . ( Desifektionsmittel auf... \n",
"146 [Anfertigung Bestellung 6 Abstandseinstellplat... \n",
"\n",
"[147 rows x 5 columns]"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"topic_model.get_topic_info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Problem:**\n",
"- Modell nutzt klassische Stoppwörter mit und verfälscht das Ergebnis\n",
"- BERTopic-Vorschlag: Nutzung einer Stoppwortliste im Tokenizer-Modul der Pipeline\n",
"- gewählter Ansatz: Entfernung bereits nach Generierung der Embeddings\n",
"\n",
"- Verfälschung durch Nutzung von zusammengeführtem Datensatz (``num_occur``), fließt nicht mit ein\n",
"- Alternative: Rekonstruktion Datensatz mit Anzahl Einträgen --> riesiger Rechenaufwand\n",
"    - CPU: Rechenzeit ungefähr 4 Stunden für Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bertopic import BERTopic\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# the analysed descriptions are German; scikit-learn's built-in 'english' list does not\n",
"# cover German stop words (e.g. 'und', 'der', 'von'), so use spaCy's German list instead\n",
"german_stop_words = sorted(GermanSpacyModel.Defaults.stop_words)\n",
"vectorizer_model = CountVectorizer(stop_words=german_stop_words)\n",
"topic_model = BERTopic(vectorizer_model=vectorizer_model)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\n\\nI am sure some bashers of Pens fans are pretty confused about the lack\\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\\nI am bit puzzled too and a bit relieved. However, I am going to put an end\\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\\nare killing those Devils worse than I thought. Jagr just showed you why\\nhe is much better than his regular season stats. He is also a lot\\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\\nregular season game. PENS RULE!!!\\n\\n\""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"topic_model = BERTopic()\n",
"topics, probs = topic_model.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Topic</th>\n",
" <th>Count</th>\n",
" <th>Name</th>\n",
" <th>Representation</th>\n",
" <th>Representative_Docs</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1</td>\n",
" <td>27</td>\n",
" <td>-1_reinigung_und_der_von</td>\n",
" <td>[reinigung, und, der, von, berprfung, sichtkon...</td>\n",
" <td>[3-Monatliche Reinigung und Prüfung der Kühlge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>25</td>\n",
" <td>0_kontrolle_der_auf_prfen</td>\n",
" <td>[kontrolle, der, auf, prfen, wchentliche, kont...</td>\n",
" <td>[Wöchentliche Kontrolle Klimagerät Inneneinhe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>18</td>\n",
" <td>1_siehe_wartungsplan_vorgabe_extradaten</td>\n",
" <td>[siehe, wartungsplan, vorgabe, extradaten, fir...</td>\n",
" <td>[Vorgabe aus Wartungsplan Firma Menzel (siehe ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Topic Count Name \\\n",
"0 -1 27 -1_reinigung_und_der_von \n",
"1 0 25 0_kontrolle_der_auf_prfen \n",
"2 1 18 1_siehe_wartungsplan_vorgabe_extradaten \n",
"\n",
" Representation \\\n",
"0 [reinigung, und, der, von, berprfung, sichtkon... \n",
"1 [kontrolle, der, auf, prfen, wchentliche, kont... \n",
"2 [siehe, wartungsplan, vorgabe, extradaten, fir... \n",
"\n",
" Representative_Docs \n",
"0 [3-Monatliche Reinigung und Prüfung der Kühlge... \n",
"1 [Wöchentliche Kontrolle Klimagerät Inneneinhe... \n",
"2 [Vorgabe aus Wartungsplan Firma Menzel (siehe ... "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"topic_model.get_topic_info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test Cosine Similarity\n",
"- erstelle Matrix mit Ähnlichkeits-Score (obere Dreiecksmatrix)\n",
"- jedes Wortpaar\n",
"- filtere Tabelle nach Threshold\n",
"- nutze Gewichts-Adjazenzmatrix mit Threshold als Maske\n",
" - nur Analyse von hochgewichtigen Gruppen\n",
"- analysiere Zusammenhänge in Form von Graph (ähnlich bisherigem Vorgehen)\n",
"- bilde Gruppen und benenne diese (z.B. Prüfung+Überprüfung+Kontrolle --> Überprüfung)\n",
"- baue daraus Wörterbuch und matche Begriffe bei der Erstellung"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"def build_cosine_similarity_matrix(\n",
"    adj_mat\n",
"):\n",
"    \"\"\"Build a word-by-word cosine similarity matrix for the index labels of ``adj_mat``.\n",
"\n",
"    Only the strict upper triangle is filled (every unordered pair once, in index\n",
"    order); the diagonal and the lower triangle stay 0.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    adj_mat : pandas.DataFrame\n",
"        Frame whose index labels are the words to compare. Only the index is read.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        float32 frame with the words as both index and columns.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Uses the notebook-level ``nlp`` object for the vocabulary lookup.\n",
"    \"\"\"\n",
"    # obtain words to compare\n",
"    words = adj_mat.index.to_list()\n",
"    # look up every lexeme once instead of twice per pair\n",
"    lexemes = [nlp.vocab[str(word)] for word in words]\n",
"\n",
"    # fill a plain NumPy buffer; element-wise DataFrame.at writes are much slower\n",
"    num_words = len(words)\n",
"    similarities = np.zeros((num_words, num_words), dtype=np.float32)\n",
"\n",
"    for (idx1, lex1), (idx2, lex2) in combinations(enumerate(lexemes), 2):\n",
"        # a pair with an empty vector scores 0 anyway (unless both labels are the same\n",
"        # lexeme); skipping it avoids spaCy's W008 warning and the needless call\n",
"        if lex1.orth != lex2.orth and (lex1.vector_norm == 0 or lex2.vector_norm == 0):\n",
"            continue\n",
"        similarities[idx1, idx2] = lex1.similarity(lex2)\n",
"\n",
"    return pd.DataFrame(data=similarities, index=words, columns=words)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\AppData\\Local\\Temp\\ipykernel_17216\\213623562.py:20: UserWarning: [W008] Evaluating Lexeme.similarity based on empty vectors.\n",
" cos_sim = w1.similarity(w2)\n"
]
}
],
"source": [
"cos_mat = build_cosine_similarity_matrix(adj_mat=adj_mat_undir)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Klübertemp</th>\n",
" <th>Schusssuche</th>\n",
" <th>Laser</th>\n",
" <th>Schaftteile</th>\n",
" <th>Dichtsätz</th>\n",
" <th>Tastatur</th>\n",
" <th>Vorspuleinheit</th>\n",
" <th>beginnen</th>\n",
" <th>auslesen</th>\n",
" <th>Kettspannung</th>\n",
" <th>...</th>\n",
" <th>Tänzerwalze</th>\n",
" <th>Abfallkante</th>\n",
" <th>rappeln</th>\n",
" <th>Rottenegger</th>\n",
" <th>Contrawalze</th>\n",
" <th>Eisenträger</th>\n",
" <th>Hängegurte</th>\n",
" <th>Treffen</th>\n",
" <th>Greiferarmen</th>\n",
" <th>Nadelleist</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Klübertemp</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Schusssuche</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Laser</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.324276</td>\n",
" <td>0.0</td>\n",
" <td>0.059743</td>\n",
" <td>0.133676</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-0.063913</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.167521</td>\n",
" <td>0.0</td>\n",
" <td>-0.029860</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Schaftteile</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dichtsätz</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Eisenträger</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.170954</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Hängegurte</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Treffen</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Greiferarmen</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nadelleist</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6951 rows × 6951 columns</p>\n",
"</div>"
],
"text/plain": [
" Klübertemp Schusssuche Laser Schaftteile Dichtsätz \\\n",
"Klübertemp 0.0 0.0 0.0 0.0 0.0 \n",
"Schusssuche 0.0 0.0 0.0 0.0 0.0 \n",
"Laser 0.0 0.0 0.0 0.0 0.0 \n",
"Schaftteile 0.0 0.0 0.0 0.0 0.0 \n",
"Dichtsätz 0.0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"Eisenträger 0.0 0.0 0.0 0.0 0.0 \n",
"Hängegurte 0.0 0.0 0.0 0.0 0.0 \n",
"Treffen 0.0 0.0 0.0 0.0 0.0 \n",
"Greiferarmen 0.0 0.0 0.0 0.0 0.0 \n",
"Nadelleist 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" Tastatur Vorspuleinheit beginnen auslesen Kettspannung ... \\\n",
"Klübertemp 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Schusssuche 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Laser 0.324276 0.0 0.059743 0.133676 0.0 ... \n",
"Schaftteile 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Dichtsätz 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"... ... ... ... ... ... ... \n",
"Eisenträger 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Hängegurte 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Treffen 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Greiferarmen 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"Nadelleist 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
"\n",
" Tänzerwalze Abfallkante rappeln Rottenegger Contrawalze \\\n",
"Klübertemp 0.0 0.0 0.000000 0.0 0.0 \n",
"Schusssuche 0.0 0.0 0.000000 0.0 0.0 \n",
"Laser 0.0 0.0 -0.063913 0.0 0.0 \n",
"Schaftteile 0.0 0.0 0.000000 0.0 0.0 \n",
"Dichtsätz 0.0 0.0 0.000000 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"Eisenträger 0.0 0.0 0.000000 0.0 0.0 \n",
"Hängegurte 0.0 0.0 0.000000 0.0 0.0 \n",
"Treffen 0.0 0.0 0.000000 0.0 0.0 \n",
"Greiferarmen 0.0 0.0 0.000000 0.0 0.0 \n",
"Nadelleist 0.0 0.0 0.000000 0.0 0.0 \n",
"\n",
" Eisenträger Hängegurte Treffen Greiferarmen Nadelleist \n",
"Klübertemp 0.000000 0.0 0.000000 0.0 0.0 \n",
"Schusssuche 0.000000 0.0 0.000000 0.0 0.0 \n",
"Laser 0.167521 0.0 -0.029860 0.0 0.0 \n",
"Schaftteile 0.000000 0.0 0.000000 0.0 0.0 \n",
"Dichtsätz 0.000000 0.0 0.000000 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"Eisenträger 0.000000 0.0 0.170954 0.0 0.0 \n",
"Hängegurte 0.000000 0.0 0.000000 0.0 0.0 \n",
"Treffen 0.000000 0.0 0.000000 0.0 0.0 \n",
"Greiferarmen 0.000000 0.0 0.000000 0.0 0.0 \n",
"Nadelleist 0.000000 0.0 0.000000 0.0 0.0 \n",
"\n",
"[6951 rows x 6951 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cos_mat"
]
},
{
"cell_type": "code",
"execution_count": 635,
"metadata": {},
"outputs": [],
"source": [
"WEIGHT_THRESHOLD = 10\n",
"arr = adj_mat_undir.to_numpy()\n",
"COS_THRESHOLD = 0.4\n",
"cos_arr = cos_mat.to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 636,
"metadata": {},
"outputs": [],
"source": [
"cos_arr_filt = np.where((cos_arr > COS_THRESHOLD) & (arr >= WEIGHT_THRESHOLD), cos_arr, 0)"
]
},
{
"cell_type": "code",
"execution_count": 637,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" ...,\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.],\n",
" [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
]
},
"execution_count": 637,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cos_arr_filt"
]
},
{
"cell_type": "code",
"execution_count": 638,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"217"
]
},
"execution_count": 638,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.count_nonzero(cos_arr_filt)"
]
},
{
"cell_type": "code",
"execution_count": 639,
"metadata": {},
"outputs": [],
"source": [
"thresh_cos_mat = cos_mat.copy()\n",
"thresh_cos_mat[:] = cos_arr_filt"
]
},
{
"cell_type": "code",
"execution_count": 640,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Verstärkung</th>\n",
" <th>Zuluftfilter</th>\n",
" <th>klemmt</th>\n",
" <th>Komminikation</th>\n",
" <th>Doppelholztische</th>\n",
" <th>Deckenbeleuchtung</th>\n",
" <th>Abfalltransport</th>\n",
" <th>fahrbar</th>\n",
" <th>Folieneinlauf</th>\n",
" <th>entsorgen</th>\n",
" <th>...</th>\n",
" <th>neuwertig</th>\n",
" <th>Bleit</th>\n",
" <th>Rauchentwicklung</th>\n",
" <th>Kompressorsteuerung</th>\n",
" <th>anziehen</th>\n",
" <th>Mitarbeiterin</th>\n",
" <th>Nägel</th>\n",
" <th>WZ</th>\n",
" <th>ExSchutzAnlage</th>\n",
" <th>Gemisch</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Verstärkung</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Zuluftfilter</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>klemmt</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Komminikation</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Doppelholztische</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Mitarbeiterin</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Nägel</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>WZ</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ExSchutzAnlage</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gemisch</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6951 rows × 6951 columns</p>\n",
"</div>"
],
"text/plain": [
" Verstärkung Zuluftfilter klemmt Komminikation \\\n",
"Verstärkung 0.0 0.0 0.0 0.0 \n",
"Zuluftfilter 0.0 0.0 0.0 0.0 \n",
"klemmt 0.0 0.0 0.0 0.0 \n",
"Komminikation 0.0 0.0 0.0 0.0 \n",
"Doppelholztische 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"Mitarbeiterin 0.0 0.0 0.0 0.0 \n",
"Nägel 0.0 0.0 0.0 0.0 \n",
"WZ 0.0 0.0 0.0 0.0 \n",
"ExSchutzAnlage 0.0 0.0 0.0 0.0 \n",
"Gemisch 0.0 0.0 0.0 0.0 \n",
"\n",
" Doppelholztische Deckenbeleuchtung Abfalltransport \\\n",
"Verstärkung 0.0 0.0 0.0 \n",
"Zuluftfilter 0.0 0.0 0.0 \n",
"klemmt 0.0 0.0 0.0 \n",
"Komminikation 0.0 0.0 0.0 \n",
"Doppelholztische 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"Mitarbeiterin 0.0 0.0 0.0 \n",
"Nägel 0.0 0.0 0.0 \n",
"WZ 0.0 0.0 0.0 \n",
"ExSchutzAnlage 0.0 0.0 0.0 \n",
"Gemisch 0.0 0.0 0.0 \n",
"\n",
" fahrbar Folieneinlauf entsorgen ... neuwertig Bleit \\\n",
"Verstärkung 0.0 0.0 0.0 ... 0.0 0.0 \n",
"Zuluftfilter 0.0 0.0 0.0 ... 0.0 0.0 \n",
"klemmt 0.0 0.0 0.0 ... 0.0 0.0 \n",
"Komminikation 0.0 0.0 0.0 ... 0.0 0.0 \n",
"Doppelholztische 0.0 0.0 0.0 ... 0.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"Mitarbeiterin 0.0 0.0 0.0 ... 0.0 0.0 \n",
"Nägel 0.0 0.0 0.0 ... 0.0 0.0 \n",
"WZ 0.0 0.0 0.0 ... 0.0 0.0 \n",
"ExSchutzAnlage 0.0 0.0 0.0 ... 0.0 0.0 \n",
"Gemisch 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" Rauchentwicklung Kompressorsteuerung anziehen \\\n",
"Verstärkung 0.0 0.0 0.0 \n",
"Zuluftfilter 0.0 0.0 0.0 \n",
"klemmt 0.0 0.0 0.0 \n",
"Komminikation 0.0 0.0 0.0 \n",
"Doppelholztische 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"Mitarbeiterin 0.0 0.0 0.0 \n",
"Nägel 0.0 0.0 0.0 \n",
"WZ 0.0 0.0 0.0 \n",
"ExSchutzAnlage 0.0 0.0 0.0 \n",
"Gemisch 0.0 0.0 0.0 \n",
"\n",
" Mitarbeiterin Nägel WZ ExSchutzAnlage Gemisch \n",
"Verstärkung 0.0 0.0 0.0 0.0 0.0 \n",
"Zuluftfilter 0.0 0.0 0.0 0.0 0.0 \n",
"klemmt 0.0 0.0 0.0 0.0 0.0 \n",
"Komminikation 0.0 0.0 0.0 0.0 0.0 \n",
"Doppelholztische 0.0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... \n",
"Mitarbeiterin 0.0 0.0 0.0 0.0 0.0 \n",
"Nägel 0.0 0.0 0.0 0.0 0.0 \n",
"WZ 0.0 0.0 0.0 0.0 0.0 \n",
"ExSchutzAnlage 0.0 0.0 0.0 0.0 0.0 \n",
"Gemisch 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[6951 rows x 6951 columns]"
]
},
"execution_count": 640,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"thresh_cos_mat"
]
},
{
"cell_type": "code",
"execution_count": 641,
"metadata": {},
"outputs": [],
"source": [
"COS_MAT_PATH_CSV = f'./Graphanalyse_Gruppen/cos_mat_Wthresh_{WEIGHT_THRESHOLD}_Cthresh{int(COS_THRESHOLD*100)}.csv'\n",
"thresh_cos_mat.to_csv(path_or_buf=COS_MAT_PATH_CSV, encoding='cp1252', sep=';')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}