3899 lines
127 KiB
Plaintext
3899 lines
127 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# **Analyse 2-3**\n",
|
||
"\n",
|
||
"## Weiterführung Duplikatfindung mit Sentence-Transformer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Analyse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"from pandas import DataFrame, Series\n",
|
||
"import spacy\n",
|
||
"import sentence_transformers\n",
|
||
"from sentence_transformers import SentenceTransformer\n",
|
||
"from spacy.lang.de import German as GermanSpacyModel\n",
|
||
"from collections import Counter\n",
|
||
"from itertools import combinations\n",
|
||
"from dateutil.parser import parse\n",
|
||
"import re\n",
|
||
"\n",
|
||
"import logging\n",
|
||
"import sys\n",
|
||
"import pickle\n",
|
||
"\n",
|
||
"\n",
|
||
"from ihm_analyze.helpers import (\n",
|
||
" save_pickle,\n",
|
||
" load_pickle,\n",
|
||
" build_embedding_map,\n",
|
||
" build_cosSim_matrix,\n",
|
||
" filt_thresh_cosSim_matrix,\n",
|
||
" list_cosSim_dupl_candidates,\n",
|
||
" choose_cosSim_dupl_candidates,\n",
|
||
")\n",
|
||
"\n",
|
||
"LOGGING_LEVEL = 'INFO'\n",
|
||
"logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)\n",
|
||
"logger = logging.getLogger('base')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"The autoreload extension is already loaded. To reload it, use:\n",
|
||
" %reload_ext autoreload\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"%load_ext autoreload\n",
|
||
"%autoreload 2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"LOAD_CALC_FILES = False\n",
|
||
"\n",
|
||
"DESC_BLACKLIST = set(['-'])\n",
|
||
"\"\"\"\n",
|
||
"GENERAL_BLACKLIST = set([\n",
|
||
" 'herr', 'hr.', 'förster', 'graf', 'stöppel', \n",
|
||
" 'stab', 'kw', 'h.', 'koch', 'heininger', '.',\n",
|
||
" 'schwab', 'm.', 'wenninger', '-', '--',\n",
|
||
"])\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"GENERAL_BLACKLIST = set([\n",
|
||
" 'herr', 'hr.', 'kw', 'h.', '.',\n",
|
||
" 'm.', '-', '--', 'dr.', 'dr',\n",
|
||
"])\n",
|
||
"\n",
|
||
"#GENERAL_BLACKLIST = set()\n",
|
||
"#POS_of_interest = set(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])\n",
|
||
"POS_of_interest = set(['NOUN', 'ADJ', 'VERB', 'AUX'])\n",
|
||
"TAG_of_interest = set(['ADJD'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# load language model\n",
|
||
"# transformer model without vector embeddings\n",
|
||
"# can not be used to calculate similarities\n",
|
||
"# using sentence transformers instead\n",
|
||
"nlp = spacy.load('de_dep_news_trf')\n",
|
||
"#nlp = spacy.load('de_core_news_lg')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# load dataframe from duplicate detection\n",
|
||
"FILE_PATH_TEMP1 = './02_1_Preprocess1/01_DF_num_occur_temp1.parquet'\n",
|
||
"FILE_PATH_TEMP2 = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'\n",
|
||
"temp1 = pd.read_parquet(FILE_PATH_TEMP1)\n",
|
||
"temp2 = pd.read_pickle(FILE_PATH_TEMP2)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>descr</th>\n",
|
||
" <th>len</th>\n",
|
||
" <th>num_occur</th>\n",
|
||
" <th>assoc_obj_ids</th>\n",
|
||
" <th>num_assoc_obj_ids</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>162</th>\n",
|
||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>92592</td>\n",
|
||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||
" <td>206</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>1654</td>\n",
|
||
" <td>[301, 304, 305, 313, 314, 331, 332, 510, 511, ...</td>\n",
|
||
" <td>18</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>131</th>\n",
|
||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1616</td>\n",
|
||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1265</td>\n",
|
||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||
" <td>44</td>\n",
|
||
" <td>687</td>\n",
|
||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||
" <td>166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2679</th>\n",
|
||
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[415]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2678</th>\n",
|
||
" <td>Bitte 8 Scheiben nach Muster anfertigen. Danke.</td>\n",
|
||
" <td>48</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[140]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2677</th>\n",
|
||
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
|
||
" <td>126</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[323]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2676</th>\n",
|
||
" <td>Docke angefahren!</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[176]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6799</th>\n",
|
||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||
" <td>107</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[326]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6800 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" descr len num_occur \\\n",
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung 39 1654 \n",
|
||
"131 Tägliche Überprüfung der Ölabscheider 37 1616 \n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
|
||
"... ... ... ... \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
|
||
"2678 Bitte 8 Scheiben nach Muster anfertigen. Danke. 48 1 \n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
|
||
"2676 Docke angefahren! 17 1 \n",
|
||
"6799 Befestigung Deckel für Batteriefach defekt ... 107 1 \n",
|
||
"\n",
|
||
" assoc_obj_ids num_assoc_obj_ids \n",
|
||
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
|
||
"33 [301, 304, 305, 313, 314, 331, 332, 510, 511, ... 18 \n",
|
||
"131 [0, 970, 2134, 2137] 4 \n",
|
||
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
|
||
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
|
||
"... ... ... \n",
|
||
"2679 [415] 1 \n",
|
||
"2678 [140] 1 \n",
|
||
"2677 [323] 1 \n",
|
||
"2676 [176] 1 \n",
|
||
"6799 [326] 1 \n",
|
||
"\n",
|
||
"[6800 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"temp1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>descr</th>\n",
|
||
" <th>len</th>\n",
|
||
" <th>num_occur</th>\n",
|
||
" <th>assoc_obj_ids</th>\n",
|
||
" <th>num_assoc_obj_ids</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>162</th>\n",
|
||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||
" <td>66</td>\n",
|
||
" <td>92592</td>\n",
|
||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||
" <td>206</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||
" <td>39</td>\n",
|
||
" <td>2163</td>\n",
|
||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||
" <td>27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>131</th>\n",
|
||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1619</td>\n",
|
||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>160</th>\n",
|
||
" <td>Wöchentliche Kontrolle der WC-Anlagen</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>1265</td>\n",
|
||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>140</th>\n",
|
||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||
" <td>44</td>\n",
|
||
" <td>687</td>\n",
|
||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||
" <td>166</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2681</th>\n",
|
||
" <td>vom Eisenkernvorrichtung (Teil vom Kettenlauf ...</td>\n",
|
||
" <td>136</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[515]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2680</th>\n",
|
||
" <td>Stand 15.07.2020 (Stöppel): Herr Langner (Toyo...</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[311]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2679</th>\n",
|
||
" <td>Zahnräder der Laufkatze verschlissen Ersatztei...</td>\n",
|
||
" <td>170</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[415]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2677</th>\n",
|
||
" <td>Schalter für Bühne Schwenken abgerissen, bitte...</td>\n",
|
||
" <td>126</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[323]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2676</th>\n",
|
||
" <td>Docke angefahren!</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>[176]</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5090 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" descr len num_occur \\\n",
|
||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... 66 92592 \n",
|
||
"33 Wöchentliche Sichtkontrolle / Reinigung 39 2163 \n",
|
||
"131 Tägliche Überprüfung der Ölabscheider 37 1619 \n",
|
||
"160 Wöchentliche Kontrolle der WC-Anlagen 37 1265 \n",
|
||
"140 Halbjährliche Kontrolle des Stabbreithalters 44 687 \n",
|
||
"... ... ... ... \n",
|
||
"2681 vom Eisenkernvorrichtung (Teil vom Kettenlauf ... 136 1 \n",
|
||
"2680 Stand 15.07.2020 (Stöppel): Herr Langner (Toyo... 260 1 \n",
|
||
"2679 Zahnräder der Laufkatze verschlissen Ersatztei... 170 1 \n",
|
||
"2677 Schalter für Bühne Schwenken abgerissen, bitte... 126 1 \n",
|
||
"2676 Docke angefahren! 17 1 \n",
|
||
"\n",
|
||
" assoc_obj_ids num_assoc_obj_ids \n",
|
||
"162 [0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53... 206 \n",
|
||
"33 [301, 304, 305, 313, 314, 323, 329, 331, 332, ... 27 \n",
|
||
"131 [0, 970, 2134, 2137] 4 \n",
|
||
"160 [1352, 1353, 1354, 1684, 1685, 1686, 1687, 168... 11 \n",
|
||
"140 [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6... 166 \n",
|
||
"... ... ... \n",
|
||
"2681 [515] 1 \n",
|
||
"2680 [311] 1 \n",
|
||
"2679 [415] 1 \n",
|
||
"2677 [323] 1 \n",
|
||
"2676 [176] 1 \n",
|
||
"\n",
|
||
"[5090 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"temp2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
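||
"Export of model-training data to `spacy_train/training_data_2.txt`. This is a markdown cell, so the snippet below is not executed when the notebook runs:\n",
"\n",
"```python\n",
"# data for model training\n",
|
||
"data = temp1.iloc[50:300,0].to_list()\n",
|
||
"data = [e for e in data if e != '']\n",
|
||
"\n",
|
||
"with open('spacy_train/training_data_2.txt','w', encoding='utf-8') as f:\n",
|
||
" f.writelines(\"\\n\".join(data))\n",
"```"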
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---\n",
|
||
"\n",
|
||
"*Load Adjacency Matrix*\n",
|
||
"- built in ``Analyse_4-1``"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"SAVE_PATH_ADJ_DF = './02_1_Preprocess1/04_2_adj_mat_df.parquet'\n",
|
||
"SAVE_PATH_ADJ_DF_UNDIR = './02_1_Preprocess1/04_2_adj_mat_df_undir.parquet'\n",
|
||
"\n",
|
||
"adj_mat = pd.read_parquet(SAVE_PATH_ADJ_DF)\n",
|
||
"adj_mat_undir = pd.read_parquet(SAVE_PATH_ADJ_DF_UNDIR)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Motordrehzahl</th>\n",
|
||
" <th>frieren</th>\n",
|
||
" <th>Klimaschächte</th>\n",
|
||
" <th>Massname</th>\n",
|
||
" <th>CampenAufwickler</th>\n",
|
||
" <th>Hängekästchen</th>\n",
|
||
" <th>Schutzbügel</th>\n",
|
||
" <th>muss</th>\n",
|
||
" <th>Endlagensensor</th>\n",
|
||
" <th>Kameralinse</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Büroraum</th>\n",
|
||
" <th>Warten</th>\n",
|
||
" <th>Fahrens</th>\n",
|
||
" <th>Handregler</th>\n",
|
||
" <th>PM</th>\n",
|
||
" <th>Minute</th>\n",
|
||
" <th>Auffangkorb</th>\n",
|
||
" <th>Deaktivierung</th>\n",
|
||
" <th>Fachböden</th>\n",
|
||
" <th>Angebot</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Motordrehzahl</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>frieren</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Klimaschächte</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Massname</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>CampenAufwickler</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Minute</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Auffangkorb</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Deaktivierung</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Fachböden</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Angebot</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6468 rows × 6468 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Motordrehzahl frieren Klimaschächte Massname \\\n",
|
||
"Motordrehzahl 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Minute 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 0 \n",
|
||
"Deaktivierung 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 \n",
|
||
"\n",
|
||
" CampenAufwickler Hängekästchen Schutzbügel muss \\\n",
|
||
"Motordrehzahl 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Minute 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 0 \n",
|
||
"Deaktivierung 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 \n",
|
||
"\n",
|
||
" Endlagensensor Kameralinse ... Büroraum Warten Fahrens \\\n",
|
||
"Motordrehzahl 0 0 ... 0 0 0 \n",
|
||
"frieren 0 0 ... 0 0 0 \n",
|
||
"Klimaschächte 0 0 ... 0 0 0 \n",
|
||
"Massname 0 0 ... 0 0 0 \n",
|
||
"CampenAufwickler 0 0 ... 0 0 0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"Minute 0 0 ... 0 0 0 \n",
|
||
"Auffangkorb 0 0 ... 0 0 0 \n",
|
||
"Deaktivierung 0 0 ... 0 0 0 \n",
|
||
"Fachböden 0 0 ... 0 0 0 \n",
|
||
"Angebot 0 0 ... 0 0 0 \n",
|
||
"\n",
|
||
" Handregler PM Minute Auffangkorb Deaktivierung \\\n",
|
||
"Motordrehzahl 0 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 0 \n",
|
||
"... ... .. ... ... ... \n",
|
||
"Minute 0 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 4 0 \n",
|
||
"Deaktivierung 0 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 0 \n",
|
||
"\n",
|
||
" Fachböden Angebot \n",
|
||
"Motordrehzahl 0 0 \n",
|
||
"frieren 0 0 \n",
|
||
"Klimaschächte 0 0 \n",
|
||
"Massname 0 0 \n",
|
||
"CampenAufwickler 0 0 \n",
|
||
"... ... ... \n",
|
||
"Minute 0 0 \n",
|
||
"Auffangkorb 0 0 \n",
|
||
"Deaktivierung 0 0 \n",
|
||
"Fachböden 0 0 \n",
|
||
"Angebot 0 0 \n",
|
||
"\n",
|
||
"[6468 rows x 6468 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"adj_mat_undir"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"adj_mat_idx_lst = adj_mat_undir.index.to_list()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"6468"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(adj_mat_idx_lst)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Find similar words to group them together"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 82,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# test word embeddings to find similarities (e.g. Prüfung, prüfen, Überprüfung)\n",
|
||
"batch = [\n",
|
||
" 'Prüfung',\n",
|
||
" 'Anlage',\n",
|
||
" 'Überprüfung der Maschine',\n",
|
||
" 'Überprüfung',\n",
|
||
" 'prüfen',\n",
|
||
" 'Herr',\n",
|
||
" 'Datum',\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"#batch = adj_mat_idx_lst.copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 84,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Batches: 100%|██████████| 1/1 [00:00<00:00, 11.76it/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"embds_words = model_stfr.encode(batch, show_progress_bar=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 85,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "AssertionError",
|
||
"evalue": "",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[85], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(embds_words) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(adj_mat_idx_lst)\n",
|
||
"\u001b[1;31mAssertionError\u001b[0m: "
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"assert len(embds_words) == len(batch)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"ret = sentence_transformers.util.cos_sim(embds_words, embds_words)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 87,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"data = ret.numpy().copy()\n",
|
||
"np.fill_diagonal(data, 0)\n",
|
||
"data = np.triu(data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 88,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"cosSim_words_df = pd.DataFrame(data=data, index=range(len(batch)), columns=range(len(batch)))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 89,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>0</th>\n",
|
||
" <th>1</th>\n",
|
||
" <th>2</th>\n",
|
||
" <th>3</th>\n",
|
||
" <th>4</th>\n",
|
||
" <th>5</th>\n",
|
||
" <th>6</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.301206</td>\n",
|
||
" <td>0.374930</td>\n",
|
||
" <td>0.616439</td>\n",
|
||
" <td>0.840472</td>\n",
|
||
" <td>0.291861</td>\n",
|
||
" <td>0.156846</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.167375</td>\n",
|
||
" <td>0.269911</td>\n",
|
||
" <td>0.260174</td>\n",
|
||
" <td>0.144282</td>\n",
|
||
" <td>0.124062</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.610566</td>\n",
|
||
" <td>0.292862</td>\n",
|
||
" <td>0.193036</td>\n",
|
||
" <td>0.121310</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.476879</td>\n",
|
||
" <td>0.238001</td>\n",
|
||
" <td>0.139318</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.301440</td>\n",
|
||
" <td>0.153496</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.184479</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" 0 1 2 3 4 5 6\n",
|
||
"0 0.0 0.301206 0.374930 0.616439 0.840472 0.291861 0.156846\n",
|
||
"1 0.0 0.000000 0.167375 0.269911 0.260174 0.144282 0.124062\n",
|
||
"2 0.0 0.000000 0.000000 0.610566 0.292862 0.193036 0.121310\n",
|
||
"3 0.0 0.000000 0.000000 0.000000 0.476879 0.238001 0.139318\n",
|
||
"4 0.0 0.000000 0.000000 0.000000 0.000000 0.301440 0.153496\n",
|
||
"5 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.184479\n",
|
||
"6 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000"
|
||
]
|
||
},
|
||
"execution_count": 89,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cosSim_words_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"COSSIM_WORDS_THRESHOLD = 0.4\n",
|
||
"cosSim_words_arr = cosSim_words_df.to_numpy()\n",
|
||
"cosSim_words_arr = np.where(cosSim_words_arr < COSSIM_WORDS_THRESHOLD, 0, cosSim_words_arr)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# build mapping\n",
|
||
"word_mapping = dict()\n",
|
||
"\n",
|
||
"for idx, entry in enumerate(batch):\n",
|
||
" word_mapping[idx] = entry"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 54,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"COSSIM_WORD_THRESH = 0.9\n",
|
||
"ret_thresh = filt_thresh_cosSim_matrix(cosineSim_idx_matrix=cosSim_words_df, threshold=COSSIM_WORD_THRESH)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"2 4505 0.961571\n",
|
||
"4 2392 0.952447\n",
|
||
"15 6057 0.948648\n",
|
||
"21 3218 0.942368\n",
|
||
"38 6171 1.000000\n",
|
||
" ... \n",
|
||
"5858 6184 1.000000\n",
|
||
"5931 6053 1.000000\n",
|
||
"6056 6134 0.926162\n",
|
||
"6328 6425 1.000000\n",
|
||
"6350 6446 1.000000\n",
|
||
"Length: 618, dtype: float32"
|
||
]
|
||
},
|
||
"execution_count": 55,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ret_thresh"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Deckenplatte'"
|
||
]
|
||
},
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"word_mapping[6056]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Deckplatte'"
|
||
]
|
||
},
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"word_mapping[6134]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Threshold"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 161,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"WEIGHT_THRESHOLD = 5\n",
|
||
"arr = adj_mat_undir.to_numpy()\n",
|
||
"arr = np.where(arr < WEIGHT_THRESHOLD, 0, arr)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 162,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"2916"
|
||
]
|
||
},
|
||
"execution_count": 162,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"np.count_nonzero(arr)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 163,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"903"
|
||
]
|
||
},
|
||
"execution_count": 163,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"temp = np.sum(arr, axis=0)\n",
|
||
"np.count_nonzero(temp)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 164,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"thresh_adj_mat = adj_mat_undir.copy()\n",
|
||
"thresh_adj_mat.loc[:] = arr"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 165,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Motordrehzahl</th>\n",
|
||
" <th>frieren</th>\n",
|
||
" <th>Klimaschächte</th>\n",
|
||
" <th>Massname</th>\n",
|
||
" <th>CampenAufwickler</th>\n",
|
||
" <th>Hängekästchen</th>\n",
|
||
" <th>Schutzbügel</th>\n",
|
||
" <th>muss</th>\n",
|
||
" <th>Endlagensensor</th>\n",
|
||
" <th>Kameralinse</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Büroraum</th>\n",
|
||
" <th>Warten</th>\n",
|
||
" <th>Fahrens</th>\n",
|
||
" <th>Handregler</th>\n",
|
||
" <th>PM</th>\n",
|
||
" <th>Minute</th>\n",
|
||
" <th>Auffangkorb</th>\n",
|
||
" <th>Deaktivierung</th>\n",
|
||
" <th>Fachböden</th>\n",
|
||
" <th>Angebot</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Motordrehzahl</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>frieren</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Klimaschächte</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Massname</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>CampenAufwickler</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Minute</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Auffangkorb</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Deaktivierung</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Fachböden</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Angebot</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6469 rows × 6469 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Motordrehzahl frieren Klimaschächte Massname \\\n",
|
||
"Motordrehzahl 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Minute 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 0 \n",
|
||
"Deaktivierung 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 \n",
|
||
"\n",
|
||
" CampenAufwickler Hängekästchen Schutzbügel muss \\\n",
|
||
"Motordrehzahl 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Minute 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 0 \n",
|
||
"Deaktivierung 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 \n",
|
||
"\n",
|
||
" Endlagensensor Kameralinse ... Büroraum Warten Fahrens \\\n",
|
||
"Motordrehzahl 0 0 ... 0 0 0 \n",
|
||
"frieren 0 0 ... 0 0 0 \n",
|
||
"Klimaschächte 0 0 ... 0 0 0 \n",
|
||
"Massname 0 0 ... 0 0 0 \n",
|
||
"CampenAufwickler 0 0 ... 0 0 0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"Minute 0 0 ... 0 0 0 \n",
|
||
"Auffangkorb 0 0 ... 0 0 0 \n",
|
||
"Deaktivierung 0 0 ... 0 0 0 \n",
|
||
"Fachböden 0 0 ... 0 0 0 \n",
|
||
"Angebot 0 0 ... 0 0 0 \n",
|
||
"\n",
|
||
" Handregler PM Minute Auffangkorb Deaktivierung \\\n",
|
||
"Motordrehzahl 0 0 0 0 0 \n",
|
||
"frieren 0 0 0 0 0 \n",
|
||
"Klimaschächte 0 0 0 0 0 \n",
|
||
"Massname 0 0 0 0 0 \n",
|
||
"CampenAufwickler 0 0 0 0 0 \n",
|
||
"... ... .. ... ... ... \n",
|
||
"Minute 0 0 0 0 0 \n",
|
||
"Auffangkorb 0 0 0 0 0 \n",
|
||
"Deaktivierung 0 0 0 0 0 \n",
|
||
"Fachböden 0 0 0 0 0 \n",
|
||
"Angebot 0 0 0 0 0 \n",
|
||
"\n",
|
||
" Fachböden Angebot \n",
|
||
"Motordrehzahl 0 0 \n",
|
||
"frieren 0 0 \n",
|
||
"Klimaschächte 0 0 \n",
|
||
"Massname 0 0 \n",
|
||
"CampenAufwickler 0 0 \n",
|
||
"... ... ... \n",
|
||
"Minute 0 0 \n",
|
||
"Auffangkorb 0 0 \n",
|
||
"Deaktivierung 0 0 \n",
|
||
"Fachböden 0 0 \n",
|
||
"Angebot 0 0 \n",
|
||
"\n",
|
||
"[6469 rows x 6469 columns]"
|
||
]
|
||
},
|
||
"execution_count": 165,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"thresh_adj_mat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 166,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"ADJ_MAT_PATH_CSV = f'./02_2_Preprocess2/20240306_adj_mat_thresh_mapping_{WEIGHT_THRESHOLD}.csv'\n",
|
||
"thresh_adj_mat.to_csv(path_or_buf=ADJ_MAT_PATH_CSV, encoding='cp1252', sep=';')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---\n",
|
||
"\n",
|
||
"# BERTopic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from bertopic import BERTopic\n",
|
||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
"from sentence_transformers import SentenceTransformer\n",
|
||
" \n",
|
||
"#docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']\n",
|
||
"#model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"- docs: list of texts to analyse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"If you want to use your own embeddings, use it as follows:\n",
|
||
"\n",
|
||
"```python\n",
|
||
"from bertopic import BERTopic\n",
|
||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||
"from sentence_transformers import SentenceTransformer\n",
|
||
"\n",
|
||
"# Create embeddings\n",
|
||
"docs = fetch_20newsgroups(subset='all')['data']\n",
|
||
"sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
|
||
"embeddings = sentence_model.encode(docs, show_progress_bar=True)\n",
|
||
"\n",
|
||
"# Create topic model\n",
|
||
"topic_model = BERTopic()\n",
|
||
"topics, probs = topic_model.fit_transform(docs, embeddings)\n",
|
||
"```"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 105,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# transform all descriptions as a collection to list\n",
|
||
"descriptions = temp1['descr'].to_list()\n",
|
||
"description_batch = descriptions[:10]\n",
|
||
"description_batch = descriptions.copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 106,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"6790"
|
||
]
|
||
},
|
||
"execution_count": 106,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(description_batch)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 111,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"assert len(descriptions_w_repetition) == num_occur_total\n",
|
||
"assert len(descriptions_wo_stopwords_repetition) == num_occur_total"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 127,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"LOAD_CALC_FILES = True\n",
|
||
"LOAD_CALC_REP_FILES = True\n",
|
||
"SAVING_CALC_FILES = False\n",
|
||
"SAVING_CALC_REP_FILES = False"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 128,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# eliminate stop words from entries\n",
|
||
"if not LOAD_CALC_FILES:\n",
|
||
" descriptions_wo_stopwords = list()\n",
|
||
"\n",
|
||
" for text in description_batch:\n",
|
||
" doc = nlp(text)\n",
|
||
" ret = [token.text for token in doc if not token.is_stop]\n",
|
||
" concat = ' '.join(ret)\n",
|
||
" \n",
|
||
" descriptions_wo_stopwords.append(concat)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 129,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# calculate embeddings\n",
|
||
"#embds = model_stfr.encode(description_batch, show_progress_bar=True)\n",
|
||
"\n",
|
||
"# repetition dataset too large, model on CPU using approx. 4 hours\n",
|
||
"#embds_rep = model_stfr.encode(descriptions_w_repetition, show_progress_bar=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 130,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# save\n",
|
||
"SAVE_PATH_EMBEDDINGS = './TopicModelling/embds.npy'\n",
|
||
"SAVE_PATH_EMBEDDINGS_REP = './TopicModelling/embds_rep.npy'\n",
|
||
"SAVE_PATH_WO_STOPWORDS = './TopicModelling/descr_wo_stopwords.pkl'\n",
|
||
"SAVE_PATH_WO_STOPWORDS_REP = './TopicModelling/descr_wo_stopwords_rep.pkl'\n",
|
||
"SAVE_PATH_WHOLE_REP = './TopicModelling/descr_whole_rep.pkl'\n",
|
||
"if SAVING_CALC_FILES:\n",
|
||
" np.save(SAVE_PATH_EMBEDDINGS, embds)\n",
|
||
" save_pickle(obj=descriptions_wo_stopwords, path=SAVE_PATH_WO_STOPWORDS)\n",
|
||
"if SAVING_CALC_REP_FILES:\n",
|
||
" #np.save(SAVE_PATH_EMBEDDINGS_REP, embds_rep)\n",
|
||
" save_pickle(obj=descriptions_wo_stopwords_repetition, path=SAVE_PATH_WO_STOPWORDS_REP)\n",
|
||
" save_pickle(obj=descriptions_w_repetition, path=SAVE_PATH_WHOLE_REP)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 131,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"loading...\n",
|
||
"loaded\n",
|
||
"loading...\n",
|
||
"loaded\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# load\n",
|
||
"if LOAD_CALC_FILES:\n",
|
||
" print('loading...')\n",
|
||
" embds = np.load(SAVE_PATH_EMBEDDINGS)\n",
|
||
" #embds_rep = np.load(SAVE_PATH_EMBEDDINGS_REP)\n",
|
||
" descriptions_wo_stopwords = load_pickle(path=SAVE_PATH_WO_STOPWORDS)\n",
|
||
" print('loaded')\n",
|
||
"if LOAD_CALC_REP_FILES:\n",
|
||
" print('loading...')\n",
|
||
" descriptions_wo_stopwords_repetition = load_pickle(path=SAVE_PATH_WO_STOPWORDS_REP)\n",
|
||
" descriptions_w_repetition = load_pickle(path=SAVE_PATH_WHOLE_REP)\n",
|
||
" print('loaded')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"assert len(descriptions_wo_stopwords) == len(description_batch)\n",
|
||
"assert len(embds) == len(description_batch)\n",
|
||
"assert len(embds) == len(descriptions_wo_stopwords)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 126,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"6790"
|
||
]
|
||
},
|
||
"execution_count": 126,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(embds)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# load duplicate cleaned dataset\n",
|
||
"SAVE_PATH_REMOVED_DUPL = './02_1_Preprocess1/03_dataset_remov_dupl_similar_whole.pkl'\n",
|
||
"\n",
|
||
"temp2 = "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 157,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"ADJ_DF_PATH = './Graphanalyse/adj_mat_df.fth'\n",
|
||
"adj_mat_undir = pd.read_feather(ADJ_DF_PATH)\n",
|
||
"adj_mat_undir = adj_mat_undir.set_index('index')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 158,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Verunreinigung</th>\n",
|
||
" <th>Luftreiniger</th>\n",
|
||
" <th>bedeckt</th>\n",
|
||
" <th>Schweikopf</th>\n",
|
||
" <th>Frostprävention</th>\n",
|
||
" <th>Mithilfe</th>\n",
|
||
" <th>Interne</th>\n",
|
||
" <th>Reinigung</th>\n",
|
||
" <th>Prüfen</th>\n",
|
||
" <th>Defekte</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Visuelle</th>\n",
|
||
" <th>Rundgang</th>\n",
|
||
" <th>Rieme</th>\n",
|
||
" <th>sein</th>\n",
|
||
" <th>Eigenverantwortlichkeit</th>\n",
|
||
" <th>Lager</th>\n",
|
||
" <th>Leckage</th>\n",
|
||
" <th>werden</th>\n",
|
||
" <th>Wartungsplan</th>\n",
|
||
" <th>Monat</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>index</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Verunreinigung</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Luftreiniger</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>bedeckt</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Schweikopf</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Frostprävention</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Lager</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Leckage</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>werden</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Wartungsplan</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Monat</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>165 rows × 165 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Verunreinigung Luftreiniger bedeckt Schweikopf \\\n",
|
||
"index \n",
|
||
"Verunreinigung 0 0 0 0 \n",
|
||
"Luftreiniger 0 0 0 0 \n",
|
||
"bedeckt 0 0 0 0 \n",
|
||
"Schweikopf 0 0 0 0 \n",
|
||
"Frostprävention 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Lager 0 0 0 0 \n",
|
||
"Leckage 0 0 0 0 \n",
|
||
"werden 0 0 0 0 \n",
|
||
"Wartungsplan 0 0 0 0 \n",
|
||
"Monat 0 0 0 0 \n",
|
||
"\n",
|
||
" Frostprävention Mithilfe Interne Reinigung Prüfen \\\n",
|
||
"index \n",
|
||
"Verunreinigung 0 0 0 0 0 \n",
|
||
"Luftreiniger 0 0 0 0 0 \n",
|
||
"bedeckt 0 0 0 0 0 \n",
|
||
"Schweikopf 0 0 0 0 0 \n",
|
||
"Frostprävention 0 0 0 0 0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"Lager 0 0 0 0 0 \n",
|
||
"Leckage 0 0 0 0 0 \n",
|
||
"werden 0 0 0 0 0 \n",
|
||
"Wartungsplan 0 0 0 0 0 \n",
|
||
"Monat 0 0 0 0 0 \n",
|
||
"\n",
|
||
" Defekte ... Visuelle Rundgang Rieme sein \\\n",
|
||
"index ... \n",
|
||
"Verunreinigung 0 ... 0 0 0 0 \n",
|
||
"Luftreiniger 0 ... 0 0 0 0 \n",
|
||
"bedeckt 0 ... 0 0 0 0 \n",
|
||
"Schweikopf 0 ... 0 0 0 0 \n",
|
||
"Frostprävention 0 ... 0 0 0 0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"Lager 0 ... 0 0 0 0 \n",
|
||
"Leckage 0 ... 0 0 0 0 \n",
|
||
"werden 0 ... 0 0 0 0 \n",
|
||
"Wartungsplan 0 ... 0 0 0 0 \n",
|
||
"Monat 0 ... 0 0 0 0 \n",
|
||
"\n",
|
||
" Eigenverantwortlichkeit Lager Leckage werden \\\n",
|
||
"index \n",
|
||
"Verunreinigung 0 0 0 0 \n",
|
||
"Luftreiniger 0 0 0 0 \n",
|
||
"bedeckt 0 0 0 0 \n",
|
||
"Schweikopf 0 0 0 0 \n",
|
||
"Frostprävention 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Lager 0 0 0 0 \n",
|
||
"Leckage 0 0 0 0 \n",
|
||
"werden 0 0 0 0 \n",
|
||
"Wartungsplan 0 0 0 0 \n",
|
||
"Monat 0 0 0 0 \n",
|
||
"\n",
|
||
" Wartungsplan Monat \n",
|
||
"index \n",
|
||
"Verunreinigung 0 0 \n",
|
||
"Luftreiniger 0 0 \n",
|
||
"bedeckt 0 0 \n",
|
||
"Schweikopf 0 0 \n",
|
||
"Frostprävention 0 0 \n",
|
||
"... ... ... \n",
|
||
"Lager 0 0 \n",
|
||
"Leckage 0 0 \n",
|
||
"werden 0 0 \n",
|
||
"Wartungsplan 0 0 \n",
|
||
"Monat 0 0 \n",
|
||
"\n",
|
||
"[165 rows x 165 columns]"
|
||
]
|
||
},
|
||
"execution_count": 158,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"adj_mat_undir"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---\n",
|
||
"\n",
|
||
"*Repetition analysis*"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"124008"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"temp1['num_occur'].sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"124008"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"temp2 = temp1[['descr', 'num_occur']]\n",
|
||
"#temp2 = temp2.iloc[:10,:]\n",
|
||
"num_occur_total = temp2['num_occur'].sum()\n",
|
||
"num_occur_total"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# reconstruct dataset with number of occurences for each entry\n",
|
||
"\n",
|
||
" if not LOAD_CALC_REP_FILES:\n",
|
||
" descriptions_w_repetition = list()\n",
|
||
" descriptions_wo_stopwords_repetition = list()\n",
|
||
"\n",
|
||
" for idx, entry in enumerate(temp2.itertuples()):\n",
|
||
" num_occur = entry.num_occur\n",
|
||
" descr_whole = entry.descr\n",
|
||
" descr_wo_stopwords = descriptions_wo_stopwords[idx]\n",
|
||
" \n",
|
||
" descr_whole_rep = [descr_whole] * num_occur\n",
|
||
" descr_wo_stopwords_rep = [descr_wo_stopwords] * num_occur\n",
|
||
" \n",
|
||
" descriptions_w_repetition.extend(descr_whole_rep)\n",
|
||
" descriptions_wo_stopwords_repetition.extend(descr_wo_stopwords_rep)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"---"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"topic_model = BERTopic()\n",
|
||
"#topics, probs = topic_model.fit_transform(description_batch, embds)\n",
|
||
"topics, probs = topic_model.fit_transform(descriptions_wo_stopwords, embds)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Topic</th>\n",
|
||
" <th>Count</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Representation</th>\n",
|
||
" <th>Representative_Docs</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>1244</td>\n",
|
||
" <td>-1_bitte_danke_prfen_strung</td>\n",
|
||
" <td>[bitte, danke, prfen, strung, herr, defekt, be...</td>\n",
|
||
" <td>[- Reinigen Gerätes Außen feuchten Reinigungst...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>332</td>\n",
|
||
" <td>0_docke_dockenwickler_belag_berziehen</td>\n",
|
||
" <td>[docke, dockenwickler, belag, berziehen, docke...</td>\n",
|
||
" <td>[docke, Docke Belag überziehen, docke überzieh...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>164</td>\n",
|
||
" <td>1_motor_vortrockner_hauptmotor_servomotor</td>\n",
|
||
" <td>[motor, vortrockner, hauptmotor, servomotor, s...</td>\n",
|
||
" <td>[Vortrockner 1 Motor defekt ., Motor Geräusche...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>156</td>\n",
|
||
" <td>2_lager_umlenkwalze_tauschen_umwlzpumpe</td>\n",
|
||
" <td>[lager, umlenkwalze, tauschen, umwlzpumpe, kit...</td>\n",
|
||
" <td>[Lager defekt ., Lager Defekt, Lager defekt ! ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>155</td>\n",
|
||
" <td>3_kbk_stecker_defekt_steigdocke</td>\n",
|
||
" <td>[kbk, stecker, defekt, steigdocke, kupplung, b...</td>\n",
|
||
" <td>[Kabel Stecker defekt, Kabel Stecker defekt, K...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>142</th>\n",
|
||
" <td>141</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>141_luft_messwalze_mluft_sauglippe</td>\n",
|
||
" <td>[luft, messwalze, mluft, sauglippe, reinschaue...</td>\n",
|
||
" <td>[Sauglippe bewegt, M. läuft . Linke Bedienseit...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>143</th>\n",
|
||
" <td>142</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>142_paste_gewnschten_anschlagmittel_effekt</td>\n",
|
||
" <td>[paste, gewnschten, anschlagmittel, effekt, fh...</td>\n",
|
||
" <td>[40Stück Gewindebolzen Keramikbremsen anfertig...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>144</th>\n",
|
||
" <td>143</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>143_frostprvention_wrmetauscher_warmwasserhahn...</td>\n",
|
||
" <td>[frostprvention, wrmetauscher, warmwasserhahn,...</td>\n",
|
||
" <td>[Wärmeofen ( Funktion Line ) Hebel öffnen Ofen...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>145</th>\n",
|
||
" <td>144</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>144_auffllen_aschenbecher_desifektionsmittel_l...</td>\n",
|
||
" <td>[auffllen, aschenbecher, desifektionsmittel, l...</td>\n",
|
||
" <td>[Täglicher Rundgang . ( Desifektionsmittel auf...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>146</th>\n",
|
||
" <td>145</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>145_pflasterschrank_mm_verbandsmaterial_cm</td>\n",
|
||
" <td>[pflasterschrank, mm, verbandsmaterial, cm, fi...</td>\n",
|
||
" <td>[Anfertigung Bestellung 6 Abstandseinstellplat...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>147 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Topic Count Name \\\n",
|
||
"0 -1 1244 -1_bitte_danke_prfen_strung \n",
|
||
"1 0 332 0_docke_dockenwickler_belag_berziehen \n",
|
||
"2 1 164 1_motor_vortrockner_hauptmotor_servomotor \n",
|
||
"3 2 156 2_lager_umlenkwalze_tauschen_umwlzpumpe \n",
|
||
"4 3 155 3_kbk_stecker_defekt_steigdocke \n",
|
||
".. ... ... ... \n",
|
||
"142 141 11 141_luft_messwalze_mluft_sauglippe \n",
|
||
"143 142 11 142_paste_gewnschten_anschlagmittel_effekt \n",
|
||
"144 143 11 143_frostprvention_wrmetauscher_warmwasserhahn... \n",
|
||
"145 144 11 144_auffllen_aschenbecher_desifektionsmittel_l... \n",
|
||
"146 145 10 145_pflasterschrank_mm_verbandsmaterial_cm \n",
|
||
"\n",
|
||
" Representation \\\n",
|
||
"0 [bitte, danke, prfen, strung, herr, defekt, be... \n",
|
||
"1 [docke, dockenwickler, belag, berziehen, docke... \n",
|
||
"2 [motor, vortrockner, hauptmotor, servomotor, s... \n",
|
||
"3 [lager, umlenkwalze, tauschen, umwlzpumpe, kit... \n",
|
||
"4 [kbk, stecker, defekt, steigdocke, kupplung, b... \n",
|
||
".. ... \n",
|
||
"142 [luft, messwalze, mluft, sauglippe, reinschaue... \n",
|
||
"143 [paste, gewnschten, anschlagmittel, effekt, fh... \n",
|
||
"144 [frostprvention, wrmetauscher, warmwasserhahn,... \n",
|
||
"145 [auffllen, aschenbecher, desifektionsmittel, l... \n",
|
||
"146 [pflasterschrank, mm, verbandsmaterial, cm, fi... \n",
|
||
"\n",
|
||
" Representative_Docs \n",
|
||
"0 [- Reinigen Gerätes Außen feuchten Reinigungst... \n",
|
||
"1 [docke, Docke Belag überziehen, docke überzieh... \n",
|
||
"2 [Vortrockner 1 Motor defekt ., Motor Geräusche... \n",
|
||
"3 [Lager defekt ., Lager Defekt, Lager defekt ! ... \n",
|
||
"4 [Kabel Stecker defekt, Kabel Stecker defekt, K... \n",
|
||
".. ... \n",
|
||
"142 [Sauglippe bewegt, M. läuft . Linke Bedienseit... \n",
|
||
"143 [40Stück Gewindebolzen Keramikbremsen anfertig... \n",
|
||
"144 [Wärmeofen ( Funktion Line ) Hebel öffnen Ofen... \n",
|
||
"145 [Täglicher Rundgang . ( Desifektionsmittel auf... \n",
|
||
"146 [Anfertigung Bestellung 6 Abstandseinstellplat... \n",
|
||
"\n",
|
||
"[147 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"topic_model.get_topic_info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"**Problem:**\n",
|
||
"- Modell nutzt klasische Stoppwörter mit und verfälscht Ergebnis\n",
|
||
"- BERTopic-Vorschlag: Nutzung einer Stopwortliste im Tokenizer-Modul der Pipeline\n",
|
||
"- gewählter Ansatz: Entfernung bereits nach Generierung der Embeddings\n",
|
||
"\n",
|
||
"- Verfälschung durch Nutzung von zusammengeführtem Datensatz (``num_occur``), fließt nicht mit ein\n",
|
||
"- Alternative: Rekonstruktion Datensatz mit Anzahl Einträgen --> riesiger Rechenaufwand\n",
|
||
" - CPU: Rechenzeit ungefährt 4 Stunden für Embeddings"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from bertopic import BERTopic\n",
|
||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
"\n",
|
||
"vectorizer_model = CountVectorizer(stop_words=\"english\")\n",
|
||
"topic_model = BERTopic(vectorizer_model=vectorizer_model)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"\"\\n\\nI am sure some bashers of Pens fans are pretty confused about the lack\\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\\nI am bit puzzled too and a bit relieved. However, I am going to put an end\\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\\nare killing those Devils worse than I thought. Jagr just showed you why\\nhe is much better than his regular season stats. He is also a lot\\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\\nregular season game. PENS RULE!!!\\n\\n\""
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"docs[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"topic_model = BERTopic()\n",
|
||
"topics, probs = topic_model.fit_transform(docs)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Topic</th>\n",
|
||
" <th>Count</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Representation</th>\n",
|
||
" <th>Representative_Docs</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>27</td>\n",
|
||
" <td>-1_reinigung_und_der_von</td>\n",
|
||
" <td>[reinigung, und, der, von, berprfung, sichtkon...</td>\n",
|
||
" <td>[3-Monatliche Reinigung und Prüfung der Kühlge...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>0_kontrolle_der_auf_prfen</td>\n",
|
||
" <td>[kontrolle, der, auf, prfen, wchentliche, kont...</td>\n",
|
||
" <td>[Wöchentliche Kontrolle Klimagerät Inneneinhe...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>18</td>\n",
|
||
" <td>1_siehe_wartungsplan_vorgabe_extradaten</td>\n",
|
||
" <td>[siehe, wartungsplan, vorgabe, extradaten, fir...</td>\n",
|
||
" <td>[Vorgabe aus Wartungsplan Firma Menzel (siehe ...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Topic Count Name \\\n",
|
||
"0 -1 27 -1_reinigung_und_der_von \n",
|
||
"1 0 25 0_kontrolle_der_auf_prfen \n",
|
||
"2 1 18 1_siehe_wartungsplan_vorgabe_extradaten \n",
|
||
"\n",
|
||
" Representation \\\n",
|
||
"0 [reinigung, und, der, von, berprfung, sichtkon... \n",
|
||
"1 [kontrolle, der, auf, prfen, wchentliche, kont... \n",
|
||
"2 [siehe, wartungsplan, vorgabe, extradaten, fir... \n",
|
||
"\n",
|
||
" Representative_Docs \n",
|
||
"0 [3-Monatliche Reinigung und Prüfung der Kühlge... \n",
|
||
"1 [Wöchentliche Kontrolle Klimagerät Inneneinhe... \n",
|
||
"2 [Vorgabe aus Wartungsplan Firma Menzel (siehe ... "
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"topic_model.get_topic_info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Test Cosine Similarity\n",
|
||
"- erstelle Matrix mit Ähnlichkeits-Score (obere Dreiecksmatrix)\n",
|
||
"- jedes Wortpaar\n",
|
||
"- filtere Tabelle nach Threshold\n",
|
||
"- nutze Gewichts-Adjezenzmatrix mit Threshold als Maske\n",
|
||
" - nur Analyse von hochgewichtigen Gruppen\n",
|
||
"- analysiere Zusammenhänge in Form von Graph (ähnlich bisherigem Vorgehen)\n",
|
||
"- bilde Gruppen und benenne diese (z.B. Prüfung+Überprüfung+Kontrolle --> Überprüfung)\n",
|
||
"- baue daraus Wörterbuch und matche Begriffe bei der Erstellung"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def build_cosine_similarity_matrix(\n",
|
||
" adj_mat\n",
|
||
"):\n",
|
||
" # obtain words to compare\n",
|
||
" words = adj_mat.index.to_list()\n",
|
||
" \n",
|
||
" # cos matrix\n",
|
||
" cos_mat = pd.DataFrame(\n",
|
||
" data=0., \n",
|
||
" columns=words, \n",
|
||
" index=words,\n",
|
||
" dtype=np.float32,\n",
|
||
" )\n",
|
||
" \n",
|
||
" for (word1, word2) in combinations(words, 2):\n",
|
||
" # obtain model vocabulary\n",
|
||
" w1 = nlp.vocab[str(word1)]\n",
|
||
" w2 = nlp.vocab[str(word2)]\n",
|
||
" # calculate cosine similarity\n",
|
||
" cos_sim = w1.similarity(w2)\n",
|
||
" # set value\n",
|
||
" cos_mat.at[word1, word2] = cos_sim\n",
|
||
" \n",
|
||
" return cos_mat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\foersterflorian\\AppData\\Local\\Temp\\ipykernel_17216\\213623562.py:20: UserWarning: [W008] Evaluating Lexeme.similarity based on empty vectors.\n",
|
||
" cos_sim = w1.similarity(w2)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"cos_mat = build_cosine_similarity_matrix(adj_mat=adj_mat_undir)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Klübertemp</th>\n",
|
||
" <th>Schusssuche</th>\n",
|
||
" <th>Laser</th>\n",
|
||
" <th>Schaftteile</th>\n",
|
||
" <th>Dichtsätz</th>\n",
|
||
" <th>Tastatur</th>\n",
|
||
" <th>Vorspuleinheit</th>\n",
|
||
" <th>beginnen</th>\n",
|
||
" <th>auslesen</th>\n",
|
||
" <th>Kettspannung</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Tänzerwalze</th>\n",
|
||
" <th>Abfallkante</th>\n",
|
||
" <th>rappeln</th>\n",
|
||
" <th>Rottenegger</th>\n",
|
||
" <th>Contrawalze</th>\n",
|
||
" <th>Eisenträger</th>\n",
|
||
" <th>Hängegurte</th>\n",
|
||
" <th>Treffen</th>\n",
|
||
" <th>Greiferarmen</th>\n",
|
||
" <th>Nadelleist</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Klübertemp</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Schusssuche</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Laser</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.324276</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.059743</td>\n",
|
||
" <td>0.133676</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>-0.063913</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.167521</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>-0.029860</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Schaftteile</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Dichtsätz</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Eisenträger</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.170954</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Hängegurte</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Treffen</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Greiferarmen</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Nadelleist</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6951 rows × 6951 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Klübertemp Schusssuche Laser Schaftteile Dichtsätz \\\n",
|
||
"Klübertemp 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Schusssuche 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Laser 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Schaftteile 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Dichtsätz 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"Eisenträger 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Hängegurte 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Treffen 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Greiferarmen 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Nadelleist 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Tastatur Vorspuleinheit beginnen auslesen Kettspannung ... \\\n",
|
||
"Klübertemp 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Schusssuche 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Laser 0.324276 0.0 0.059743 0.133676 0.0 ... \n",
|
||
"Schaftteile 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Dichtsätz 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"Eisenträger 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Hängegurte 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Treffen 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Greiferarmen 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"Nadelleist 0.000000 0.0 0.000000 0.000000 0.0 ... \n",
|
||
"\n",
|
||
" Tänzerwalze Abfallkante rappeln Rottenegger Contrawalze \\\n",
|
||
"Klübertemp 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Schusssuche 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Laser 0.0 0.0 -0.063913 0.0 0.0 \n",
|
||
"Schaftteile 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Dichtsätz 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"Eisenträger 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Hängegurte 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Treffen 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Greiferarmen 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"Nadelleist 0.0 0.0 0.000000 0.0 0.0 \n",
|
||
"\n",
|
||
" Eisenträger Hängegurte Treffen Greiferarmen Nadelleist \n",
|
||
"Klübertemp 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Schusssuche 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Laser 0.167521 0.0 -0.029860 0.0 0.0 \n",
|
||
"Schaftteile 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Dichtsätz 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"Eisenträger 0.000000 0.0 0.170954 0.0 0.0 \n",
|
||
"Hängegurte 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Treffen 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Greiferarmen 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"Nadelleist 0.000000 0.0 0.000000 0.0 0.0 \n",
|
||
"\n",
|
||
"[6951 rows x 6951 columns]"
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cos_mat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 635,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"WEIGHT_THRESHOLD = 10\n",
|
||
"arr = adj_mat_undir.to_numpy()\n",
|
||
"COS_THRESHOLD = 0.4\n",
|
||
"cos_arr = cos_mat.to_numpy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 636,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"cos_arr_filt = np.where((cos_arr > COS_THRESHOLD) & (arr >= WEIGHT_THRESHOLD), cos_arr, 0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 637,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[0., 0., 0., ..., 0., 0., 0.],\n",
|
||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||
" ...,\n",
|
||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||
" [0., 0., 0., ..., 0., 0., 0.],\n",
|
||
" [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
|
||
]
|
||
},
|
||
"execution_count": 637,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cos_arr_filt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 638,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"217"
|
||
]
|
||
},
|
||
"execution_count": 638,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"np.count_nonzero(cos_arr_filt)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 639,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"thresh_cos_mat = cos_mat.copy()\n",
|
||
"thresh_cos_mat[:] = cos_arr_filt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 640,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Verstärkung</th>\n",
|
||
" <th>Zuluftfilter</th>\n",
|
||
" <th>klemmt</th>\n",
|
||
" <th>Komminikation</th>\n",
|
||
" <th>Doppelholztische</th>\n",
|
||
" <th>Deckenbeleuchtung</th>\n",
|
||
" <th>Abfalltransport</th>\n",
|
||
" <th>fahrbar</th>\n",
|
||
" <th>Folieneinlauf</th>\n",
|
||
" <th>entsorgen</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>neuwertig</th>\n",
|
||
" <th>Bleit</th>\n",
|
||
" <th>Rauchentwicklung</th>\n",
|
||
" <th>Kompressorsteuerung</th>\n",
|
||
" <th>anziehen</th>\n",
|
||
" <th>Mitarbeiterin</th>\n",
|
||
" <th>Nägel</th>\n",
|
||
" <th>WZ</th>\n",
|
||
" <th>ExSchutzAnlage</th>\n",
|
||
" <th>Gemisch</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Verstärkung</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Zuluftfilter</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>klemmt</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Komminikation</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Doppelholztische</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Mitarbeiterin</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Nägel</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>WZ</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ExSchutzAnlage</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Gemisch</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6951 rows × 6951 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Verstärkung Zuluftfilter klemmt Komminikation \\\n",
|
||
"Verstärkung 0.0 0.0 0.0 0.0 \n",
|
||
"Zuluftfilter 0.0 0.0 0.0 0.0 \n",
|
||
"klemmt 0.0 0.0 0.0 0.0 \n",
|
||
"Komminikation 0.0 0.0 0.0 0.0 \n",
|
||
"Doppelholztische 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"Mitarbeiterin 0.0 0.0 0.0 0.0 \n",
|
||
"Nägel 0.0 0.0 0.0 0.0 \n",
|
||
"WZ 0.0 0.0 0.0 0.0 \n",
|
||
"ExSchutzAnlage 0.0 0.0 0.0 0.0 \n",
|
||
"Gemisch 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Doppelholztische Deckenbeleuchtung Abfalltransport \\\n",
|
||
"Verstärkung 0.0 0.0 0.0 \n",
|
||
"Zuluftfilter 0.0 0.0 0.0 \n",
|
||
"klemmt 0.0 0.0 0.0 \n",
|
||
"Komminikation 0.0 0.0 0.0 \n",
|
||
"Doppelholztische 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"Mitarbeiterin 0.0 0.0 0.0 \n",
|
||
"Nägel 0.0 0.0 0.0 \n",
|
||
"WZ 0.0 0.0 0.0 \n",
|
||
"ExSchutzAnlage 0.0 0.0 0.0 \n",
|
||
"Gemisch 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" fahrbar Folieneinlauf entsorgen ... neuwertig Bleit \\\n",
|
||
"Verstärkung 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"Zuluftfilter 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"klemmt 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"Komminikation 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"Doppelholztische 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"Mitarbeiterin 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"Nägel 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"WZ 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"ExSchutzAnlage 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"Gemisch 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||
"\n",
|
||
" Rauchentwicklung Kompressorsteuerung anziehen \\\n",
|
||
"Verstärkung 0.0 0.0 0.0 \n",
|
||
"Zuluftfilter 0.0 0.0 0.0 \n",
|
||
"klemmt 0.0 0.0 0.0 \n",
|
||
"Komminikation 0.0 0.0 0.0 \n",
|
||
"Doppelholztische 0.0 0.0 0.0 \n",
|
||
"... ... ... ... \n",
|
||
"Mitarbeiterin 0.0 0.0 0.0 \n",
|
||
"Nägel 0.0 0.0 0.0 \n",
|
||
"WZ 0.0 0.0 0.0 \n",
|
||
"ExSchutzAnlage 0.0 0.0 0.0 \n",
|
||
"Gemisch 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" Mitarbeiterin Nägel WZ ExSchutzAnlage Gemisch \n",
|
||
"Verstärkung 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Zuluftfilter 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"klemmt 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Komminikation 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Doppelholztische 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"Mitarbeiterin 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Nägel 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"WZ 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"ExSchutzAnlage 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"Gemisch 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
"[6951 rows x 6951 columns]"
|
||
]
|
||
},
|
||
"execution_count": 640,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"thresh_cos_mat"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 641,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"COS_MAT_PATH_CSV = f'./Graphanalyse_Gruppen/cos_mat_Wthresh_{WEIGHT_THRESHOLD}_Cthresh{int(COS_THRESHOLD*100)}.csv'\n",
|
||
"thresh_cos_mat.to_csv(path_or_buf=COS_MAT_PATH_CSV, encoding='cp1252', sep=';')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|