improved imports, dummy dataset generation

This commit is contained in:
Florian Förster
2024-08-07 20:06:06 +02:00
parent 3f58a14852
commit 9328c0218a
35 changed files with 1966 additions and 106 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"text1 = \"Betriebssicherheitsüberprüfung\"\n",
"text3 = \"Ich habe die Betriebssicherheitsüberprüfung durchgeführt.\"\n",
"text2 = \"die Betriebssicherheitsüberprüfung durchgeführt\"\n",
"#text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"text1 = \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
"text3 = \"3-monatliche Sichtkontrolle / Reinigung\"\n",
"text2 = \"Wöchentliche Sichtkontrolle / Reinigun\"\n",
"#text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"text1 = \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
"text3 = \"Tägliche Kontrolle der Wasseraufbereitungsanlagen\"\n",
"text2 = \"Wöchentliche Kontrolle der Wasseraufbereitungsanlagen\"\n",
"text4 = \"Täglihce Kontolle der Wasseraufberitungsanlagen\"\n",
"#text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#text1 = 'Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers\\n'\n",
"#text3 = 'Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\foersterflorian\\mambaforge\\envs\\test\\Lib\\site-packages\\torch\\_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cosine-Similarity t1+2: tensor([[0.4740]])\n",
"Cosine-Similarity t1+3: tensor([[0.4360]])\n",
"Cosine-Similarity t2+3: tensor([[0.9494]])\n",
"Cosine-Similarity t2+4: tensor([[0.7007]])\n"
]
}
],
"source": [
"from sentence_transformers import SentenceTransformer, util\n",
"\n",
"# Alternative models (disabled):\n",
"#model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"#model = SentenceTransformer(\"all-mpnet-base-v2\")\n",
"model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n",
"\n",
"# Example input (disabled); the active `sentences`/`text1`..`text4` come from\n",
"# the cells above.\n",
"# sentences = [\n",
"#     \"This framework generates embeddings for each input sentence\",\n",
"#     \"Sentences are passed as a list of string.\",\n",
"#     \"The quick brown fox jumps over the lazy dog.\",\n",
"# ]\n",
"\n",
"# Sentences are encoded by calling model.encode()\n",
"sentence_embeddings = model.encode(sentences)\n",
"t1 = model.encode(text1)\n",
"t2 = model.encode(text2)\n",
"t3 = model.encode(text3)\n",
"t4 = model.encode(text4)\n",
"\n",
"# Pairwise comparison. `cos_sim` intentionally stays a notebook-level name:\n",
"# after the loop it holds the last result (t2 vs. t4), which a later cell reads.\n",
"for label, emb_a, emb_b in (\n",
"    (\"t1+2\", t1, t2),\n",
"    (\"t1+3\", t1, t3),\n",
"    (\"t2+3\", t2, t3),\n",
"    (\"t2+4\", t2, t4),\n",
"):\n",
"    cos_sim = util.cos_sim(emb_a, emb_b)\n",
"    print(f\"Cosine-Similarity {label}:\", cos_sim)\n",
"\n",
"# Print the embeddings (disabled). Kept as comments instead of a bare string\n",
"# literal, which would be echoed as the cell's output.\n",
"# for sentence, embedding in zip(sentences, sentence_embeddings):\n",
"#     print(\"Sentence:\", sentence)\n",
"#     print(\"Embedding:\", embedding)\n",
"#     print(\"\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numpy.ndarray"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(t4)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"isinstance(model, int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7007368206977844"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cos_sim.item()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cosine-Similarity: tensor([[0.6153]])\n"
]
}
],
"source": [
"from sentence_transformers import SentenceTransformer, util\n",
"\n",
"# Stand-alone check with a different model. It uses its own variable names so\n",
"# that the notebook-wide `model` and `cos_sim` are not silently replaced when\n",
"# this cell is run.\n",
"demo_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"\n",
"# Sentences are encoded by calling demo_model.encode()\n",
"emb1 = demo_model.encode(\"This is a red cat with a hat.\")\n",
"emb2 = demo_model.encode(\"Have you seen my red cat?\")\n",
"\n",
"demo_cos_sim = util.cos_sim(emb1, emb2)\n",
"print(\"Cosine-Similarity:\", demo_cos_sim)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

128
notebooks/lang_main.xml Normal file
View File

@@ -0,0 +1,128 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<vizmap id="VizMap-2024_07_12-08_08" documentVersion="3.1">
<visualStyle name="lang_main">
<network>
<visualProperty default="0.0" name="NETWORK_CENTER_X_LOCATION"/>
<visualProperty default="0.0" name="NETWORK_CENTER_Y_LOCATION"/>
<visualProperty default="0.0" name="NETWORK_CENTER_Z_LOCATION"/>
<visualProperty default="false" name="NETWORK_ANNOTATION_SELECTION"/>
<visualProperty default="1.0" name="NETWORK_SCALE_FACTOR"/>
<visualProperty default="false" name="NETWORK_NODE_LABEL_SELECTION"/>
<visualProperty default="400.0" name="NETWORK_HEIGHT"/>
<visualProperty default="true" name="NETWORK_NODE_SELECTION"/>
<visualProperty default="550.0" name="NETWORK_WIDTH"/>
<visualProperty default="0.0" name="NETWORK_DEPTH"/>
<visualProperty default="false" name="NETWORK_FORCE_HIGH_DETAIL"/>
<visualProperty default="" name="NETWORK_TITLE"/>
<visualProperty default="true" name="NETWORK_EDGE_SELECTION"/>
<visualProperty default="#F7FFFF" name="NETWORK_BACKGROUND_PAINT"/>
</network>
<node>
<dependency value="true" name="nodeCustomGraphicsSizeSync"/>
<dependency value="true" name="nodeSizeLocked"/>
<visualProperty default="ROUND_RECTANGLE" name="NODE_LABEL_BACKGROUND_SHAPE"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_9"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_7"/>
<visualProperty default="true" name="NODE_NESTED_NETWORK_IMAGE_VISIBLE"/>
<visualProperty default="0.0" name="NODE_LABEL_ROTATION"/>
<visualProperty default="175" name="NODE_LABEL_BACKGROUND_TRANSPARENCY"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_8"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_2"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_6"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_7"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_1"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_4"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_9"/>
<visualProperty default="ROUND_RECTANGLE" name="COMPOUND_NODE_SHAPE"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_5"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_9"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_5"/>
<visualProperty default="10.0" name="COMPOUND_NODE_PADDING"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_3"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_6"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_8"/>
<visualProperty default="SE,NW,c,-2.00,3.00" name="NODE_LABEL_POSITION"/>
<visualProperty default="ELLIPSE" name="NODE_SHAPE"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_3"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_4"/>
<visualProperty default="SansSerif.plain,plain,12" name="NODE_LABEL_FONT_FACE"/>
<visualProperty default="#D1F5BE" name="NODE_BORDER_PAINT"/>
<visualProperty default="40.0" name="NODE_HEIGHT"/>
<visualProperty default="255" name="NODE_LABEL_TRANSPARENCY"/>
<visualProperty default="#E1E1E1" name="NODE_LABEL_BACKGROUND_COLOR"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_6"/>
<visualProperty default="false" name="NODE_SELECTED"/>
<visualProperty default="0.0" name="NODE_DEPTH"/>
<visualProperty default="SOLID" name="NODE_BORDER_STROKE"/>
<visualProperty default="" name="NODE_TOOLTIP"/>
<visualProperty default="7.0" name="NODE_BORDER_WIDTH"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_2"/>
<visualProperty default="#A63C06" name="NODE_LABEL_COLOR"/>
<visualProperty default="0.0" name="NODE_X_LOCATION"/>
<visualProperty default="18.0" name="NODE_SIZE"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_8"/>
<visualProperty default="0.0" name="NODE_Z_LOCATION"/>
<visualProperty default="#FE9929" name="NODE_FILL_COLOR"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_1"/>
<visualProperty default="255" name="NODE_BORDER_TRANSPARENCY"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_1"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_2"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_3"/>
<visualProperty default="60.0" name="NODE_WIDTH"/>
<visualProperty default="" name="NODE_LABEL">
<passthroughMapping attributeName="name" attributeType="string"/>
</visualProperty>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_5"/>
<visualProperty default="500.0" name="NODE_LABEL_WIDTH"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_4"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_7"/>
<visualProperty default="#FFFF00" name="NODE_SELECTED_PAINT"/>
<visualProperty default="0.0" name="NODE_Y_LOCATION"/>
<visualProperty default="true" name="NODE_VISIBLE"/>
<visualProperty default="255" name="NODE_TRANSPARENCY"/>
<visualProperty default="14" name="NODE_LABEL_FONT_SIZE"/>
</node>
<edge>
<dependency value="true" name="arrowColorMatchesEdge"/>
<visualProperty default="false" name="EDGE_SELECTED"/>
<visualProperty default="255" name="EDGE_TRANSPARENCY"/>
<visualProperty default="10" name="EDGE_LABEL_FONT_SIZE"/>
<visualProperty default="#577399" name="EDGE_UNSELECTED_PAINT"/>
<visualProperty default="" name="EDGE_LABEL"/>
<visualProperty default="#FFFFFF" name="EDGE_STROKE_UNSELECTED_PAINT"/>
<visualProperty default="200.0" name="EDGE_LABEL_WIDTH"/>
<visualProperty default="#000000" name="EDGE_LABEL_COLOR"/>
<visualProperty default="SansSerif.plain,plain,10" name="EDGE_LABEL_FONT_FACE"/>
<visualProperty default="0.728545744495502,-0.684997151948455,0.6456513365424503" name="EDGE_BEND"/>
<visualProperty default="#B6B6B6" name="EDGE_LABEL_BACKGROUND_COLOR"/>
<visualProperty default="AUTO_BEND" name="EDGE_STACKING"/>
<visualProperty default="#000000" name="EDGE_TARGET_ARROW_UNSELECTED_PAINT"/>
<visualProperty default="0.5" name="EDGE_STACKING_DENSITY"/>
<visualProperty default="NONE" name="EDGE_TARGET_ARROW_SHAPE"/>
<visualProperty default="true" name="EDGE_VISIBLE"/>
<visualProperty default="C,C,c,0.00,0.00" name="EDGE_LABEL_POSITION"/>
<visualProperty default="0.0" name="EDGE_LABEL_ROTATION"/>
<visualProperty default="" name="EDGE_TOOLTIP"/>
<visualProperty default="0.0" name="EDGE_Z_ORDER"/>
<visualProperty default="#FFFF00" name="EDGE_TARGET_ARROW_SELECTED_PAINT"/>
<visualProperty default="#FF0000" name="EDGE_STROKE_SELECTED_PAINT"/>
<visualProperty default="NONE" name="EDGE_SOURCE_ARROW_SHAPE"/>
<visualProperty default="#FFFF00" name="EDGE_SOURCE_ARROW_SELECTED_PAINT"/>
<visualProperty default="false" name="EDGE_LABEL_AUTOROTATE"/>
<visualProperty default="true" name="EDGE_CURVED"/>
<visualProperty default="#000000" name="EDGE_SOURCE_ARROW_UNSELECTED_PAINT"/>
<visualProperty default="255" name="EDGE_LABEL_TRANSPARENCY"/>
<visualProperty default="6.0" name="EDGE_TARGET_ARROW_SIZE"/>
<visualProperty default="NONE" name="EDGE_LABEL_BACKGROUND_SHAPE"/>
<visualProperty default="255" name="EDGE_LABEL_BACKGROUND_TRANSPARENCY"/>
<visualProperty default="SOLID" name="EDGE_LINE_TYPE"/>
<visualProperty default="6.0" name="EDGE_SOURCE_ARROW_SIZE"/>
<visualProperty default="3.0" name="EDGE_WIDTH">
<continuousMapping attributeName="weight" attributeType="float">
<continuousMappingPoint attrValue="0.09520000219345093" equalValue="2.0" greaterValue="2.0" lesserValue="1.0"/>
<continuousMappingPoint attrValue="1.0" equalValue="10.0" greaterValue="1.0" lesserValue="10.0"/>
</continuousMapping>
</visualProperty>
</edge>
</visualStyle>
</vizmap>

View File

@@ -0,0 +1,59 @@
# lang_main: Config file
# File-system locations; all values are relative paths.
# NOTE(review): presumably resolved against the working directory — confirm.
[paths]
inputs = './inputs/'
results = '../scripts/results/test_20240619/'
dataset = '../data/02_202307/Export4.csv'
# Alternative results/dataset pairs (disabled):
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# Skip switches, one per pipeline stage.
# NOTE(review): judging by the `_skip` suffix, `true` disables the stage — confirm.
# These are debugging aids only: production-ready pipelines should always
# be executed in full.
[control]
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
# Disabled section; the same key is currently set under [preprocess] below.
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# Wider feature selection (disabled); only 'VorgangsBeschreibung' is active.
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
# NOTE(review): the key below is spelled "acitivities" (sic). Do not correct the
# spelling here alone — whatever code reads this key must be changed together with it.
threshold_num_acitivities = 1
# Separate from [preprocess].threshold_similarity; both are currently 0.8.
threshold_similarity = 0.8

10689
notebooks/misc.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,123 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<vizmap id="VizMap-2024_07_10-16_50" documentVersion="3.1">
<visualStyle name="template">
<network>
<visualProperty default="1.0" name="NETWORK_SCALE_FACTOR"/>
<visualProperty default="true" name="NETWORK_NODE_SELECTION"/>
<visualProperty default="#F7FFFF" name="NETWORK_BACKGROUND_PAINT"/>
<visualProperty default="false" name="NETWORK_ANNOTATION_SELECTION"/>
<visualProperty default="false" name="NETWORK_NODE_LABEL_SELECTION"/>
<visualProperty default="" name="NETWORK_TITLE"/>
<visualProperty default="0.0" name="NETWORK_CENTER_X_LOCATION"/>
<visualProperty default="true" name="NETWORK_EDGE_SELECTION"/>
<visualProperty default="550.0" name="NETWORK_WIDTH"/>
<visualProperty default="0.0" name="NETWORK_DEPTH"/>
<visualProperty default="400.0" name="NETWORK_HEIGHT"/>
<visualProperty default="0.0" name="NETWORK_CENTER_Z_LOCATION"/>
<visualProperty default="0.0" name="NETWORK_CENTER_Y_LOCATION"/>
<visualProperty default="false" name="NETWORK_FORCE_HIGH_DETAIL"/>
</network>
<node>
<dependency value="true" name="nodeCustomGraphicsSizeSync"/>
<dependency value="true" name="nodeSizeLocked"/>
<visualProperty default="0.0" name="NODE_LABEL_ROTATION"/>
<visualProperty default="14" name="NODE_LABEL_FONT_SIZE"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_3"/>
<visualProperty default="10.0" name="COMPOUND_NODE_PADDING"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_6"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_1"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_9"/>
<visualProperty default="0.0" name="NODE_Z_LOCATION"/>
<visualProperty default="true" name="NODE_VISIBLE"/>
<visualProperty default="" name="NODE_TOOLTIP"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_1"/>
<visualProperty default="500.0" name="NODE_LABEL_WIDTH"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_5"/>
<visualProperty default="#FE9929" name="NODE_FILL_COLOR"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_2"/>
<visualProperty default="#A63C06" name="NODE_LABEL_COLOR"/>
<visualProperty default="0.0" name="NODE_DEPTH"/>
<visualProperty default="7.0" name="NODE_BORDER_WIDTH"/>
<visualProperty default="#FFFF00" name="NODE_SELECTED_PAINT"/>
<visualProperty default="60.0" name="NODE_WIDTH"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_3"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_7"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_4"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_1"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_8"/>
<visualProperty default="SE,NW,c,-2.00,3.00" name="NODE_LABEL_POSITION"/>
<visualProperty default="SOLID" name="NODE_BORDER_STROKE"/>
<visualProperty default="255" name="NODE_BORDER_TRANSPARENCY"/>
<visualProperty default="ROUND_RECTANGLE" name="NODE_LABEL_BACKGROUND_SHAPE"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_8"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_7"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_8"/>
<visualProperty default="18.0" name="NODE_SIZE"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_5"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_4"/>
<visualProperty default="" name="NODE_LABEL">
<passthroughMapping attributeName="name" attributeType="string"/>
</visualProperty>
<visualProperty default="255" name="NODE_LABEL_TRANSPARENCY"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_6"/>
<visualProperty default="ELLIPSE" name="NODE_SHAPE"/>
<visualProperty default="#D1F5BE" name="NODE_BORDER_PAINT"/>
<visualProperty default="true" name="NODE_NESTED_NETWORK_IMAGE_VISIBLE"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_7"/>
<visualProperty default="false" name="NODE_SELECTED"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_9"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_3"/>
<visualProperty default="SansSerif.plain,plain,12" name="NODE_LABEL_FONT_FACE"/>
<visualProperty default="C,C,c,0.00,0.00" name="NODE_CUSTOMGRAPHICS_POSITION_2"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_4"/>
<visualProperty default="#E1E1E1" name="NODE_LABEL_BACKGROUND_COLOR"/>
<visualProperty default="0.0" name="NODE_X_LOCATION"/>
<visualProperty default="org.cytoscape.cg.model.NullCustomGraphics,0,[ Remove Graphics ]," name="NODE_CUSTOMGRAPHICS_2"/>
<visualProperty default="ROUND_RECTANGLE" name="COMPOUND_NODE_SHAPE"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_6"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_5"/>
<visualProperty default="0.0" name="NODE_CUSTOMGRAPHICS_SIZE_9"/>
<visualProperty default="175" name="NODE_LABEL_BACKGROUND_TRANSPARENCY"/>
<visualProperty default="255" name="NODE_TRANSPARENCY"/>
<visualProperty default="40.0" name="NODE_HEIGHT"/>
<visualProperty default="0.0" name="NODE_Y_LOCATION"/>
</node>
<edge>
<dependency value="true" name="arrowColorMatchesEdge"/>
<visualProperty default="NONE" name="EDGE_LABEL_BACKGROUND_SHAPE"/>
<visualProperty default="" name="EDGE_TOOLTIP"/>
<visualProperty default="AUTO_BEND" name="EDGE_STACKING"/>
<visualProperty default="#B6B6B6" name="EDGE_LABEL_BACKGROUND_COLOR"/>
<visualProperty default="C,C,c,0.00,0.00" name="EDGE_LABEL_POSITION"/>
<visualProperty default="0.728545744495502,-0.684997151948455,0.6456513365424503" name="EDGE_BEND"/>
<visualProperty default="10" name="EDGE_LABEL_FONT_SIZE"/>
<visualProperty default="NONE" name="EDGE_TARGET_ARROW_SHAPE"/>
<visualProperty default="false" name="EDGE_SELECTED"/>
<visualProperty default="#000000" name="EDGE_LABEL_COLOR"/>
<visualProperty default="#FFFFFF" name="EDGE_STROKE_UNSELECTED_PAINT"/>
<visualProperty default="#000000" name="EDGE_TARGET_ARROW_UNSELECTED_PAINT"/>
<visualProperty default="255" name="EDGE_LABEL_TRANSPARENCY"/>
<visualProperty default="255" name="EDGE_LABEL_BACKGROUND_TRANSPARENCY"/>
<visualProperty default="true" name="EDGE_CURVED"/>
<visualProperty default="NONE" name="EDGE_SOURCE_ARROW_SHAPE"/>
<visualProperty default="0.0" name="EDGE_LABEL_ROTATION"/>
<visualProperty default="SansSerif.plain,plain,10" name="EDGE_LABEL_FONT_FACE"/>
<visualProperty default="0.5" name="EDGE_STACKING_DENSITY"/>
<visualProperty default="#FFFF00" name="EDGE_SOURCE_ARROW_SELECTED_PAINT"/>
<visualProperty default="false" name="EDGE_LABEL_AUTOROTATE"/>
<visualProperty default="3.0" name="EDGE_WIDTH"/>
<visualProperty default="#FF0000" name="EDGE_STROKE_SELECTED_PAINT"/>
<visualProperty default="true" name="EDGE_VISIBLE"/>
<visualProperty default="#577399" name="EDGE_UNSELECTED_PAINT"/>
<visualProperty default="#000000" name="EDGE_SOURCE_ARROW_UNSELECTED_PAINT"/>
<visualProperty default="" name="EDGE_LABEL"/>
<visualProperty default="255" name="EDGE_TRANSPARENCY"/>
<visualProperty default="SOLID" name="EDGE_LINE_TYPE"/>
<visualProperty default="6.0" name="EDGE_TARGET_ARROW_SIZE"/>
<visualProperty default="200.0" name="EDGE_LABEL_WIDTH"/>
<visualProperty default="0.0" name="EDGE_Z_ORDER"/>
<visualProperty default="6.0" name="EDGE_SOURCE_ARROW_SIZE"/>
<visualProperty default="#FFFF00" name="EDGE_TARGET_ARROW_SELECTED_PAINT"/>
</edge>
</visualStyle>
</vizmap>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,824 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "79034f9b-adae-4066-a35f-b0e7fd38055f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n",
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n",
"INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from ihm_analyse.lib.preprocess import load_raw_data\n",
"from ihm_analyse import load_pickle\n",
"from ihm_analyse.predefined_pipes import pipe_merge"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "af94968f-ae6c-402b-b866-cb6c15b81cef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pwd = os.getcwd()\n",
"pwd = Path(pwd)\n",
"p = pwd / '01_03_Rohdaten_202403/'\n",
"p"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "753daf9e-0209-4a13-b458-1048c8b2bfbf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export6 - 43306 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export8 - 708 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export9 - 8176 Zeilen.csv'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_trunc.csv')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"folder = list(p.glob(r'*.csv'))\n",
"folder"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50d9ef9c-c56b-4d5b-9dfd-c02b68a29288",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 21,
"id": "5b76284b-bcc3-4b31-9ece-bde35b22b717",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_59499_Zeilen.csv')"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the dataset by file name: glob() returns entries in no guaranteed\n",
"# order, so a positional index such as folder[1] can silently pick a\n",
"# different export once files are added or the notebook runs elsewhere.\n",
"path_to_dataset = p / 'Export7_59499_Zeilen.csv'\n",
"path_to_dataset"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2701b8d9-657c-4d7a-b103-b8c8b1865224",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.preprocess:Loaded dataset successfully.\n",
"INFO:ihm_analyse.preprocess:Dataset properties: number of entries: 59499, number of features 20\n"
]
}
],
"source": [
"(data,) = load_raw_data(path_to_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "74d7b72e-3cab-46e2-bca2-67c70b9221c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17849"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"percentage_trunc = 0.3\n",
"num_entries_trunc = int(len(data) * percentage_trunc)\n",
"num_entries_trunc"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "62d2fa0e-baa6-4d7c-bd37-5fdbe21005d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 17849 entries, 0 to 17848\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 VorgangsID 17849 non-null int64 \n",
" 1 ObjektID 17849 non-null int64 \n",
" 2 HObjektText 17848 non-null object \n",
" 3 ObjektArtID 17849 non-null int64 \n",
" 4 ObjektArtText 17849 non-null object \n",
" 5 VorgangsTypID 17849 non-null int64 \n",
" 6 VorgangsTypName 17849 non-null object \n",
" 7 VorgangsDatum 17849 non-null datetime64[ns]\n",
" 8 VorgangsStatusId 17849 non-null int64 \n",
" 9 VorgangsPrioritaet 17849 non-null int64 \n",
" 10 VorgangsBeschreibung 15988 non-null object \n",
" 11 VorgangsOrt 0 non-null float64 \n",
" 12 VorgangsArtText 17849 non-null object \n",
" 13 ErledigungsDatum 17849 non-null datetime64[ns]\n",
" 14 ErledigungsArtText 11879 non-null object \n",
" 15 ErledigungsBeschreibung 9916 non-null object \n",
" 16 MPMelderArbeitsplatz 3 non-null object \n",
" 17 MPAbteilungBezeichnung 3 non-null object \n",
" 18 Arbeitsbeginn 1920 non-null datetime64[ns]\n",
" 19 ErstellungsDatum 17849 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](4), float64(1), int64(6), object(9)\n",
"memory usage: 2.7+ MB\n"
]
}
],
"source": [
"data_trunc = data.iloc[:num_entries_trunc].copy()\n",
"data_trunc.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "6a15fb8c-e3b7-4c92-b73d-788b337d6251",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/01_03_Rohdaten_202403/Export7_trunc.csv')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"saving_path = p / 'Export7_trunc.csv'\n",
"saving_path"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "fb912634-cefa-4b8d-a370-37f6c8178f5a",
"metadata": {},
"outputs": [],
"source": [
"# index=False: the frame carries a plain RangeIndex that is not part of the\n",
"# raw export; writing it would add an unnamed first column, so the truncated\n",
"# file would no longer have the same 20-column layout as the original.\n",
"data_trunc.to_csv(\n",
"    path_or_buf=saving_path,\n",
"    sep=';',\n",
"    encoding='cp1252',\n",
"    index=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30085691-aa23-478c-8d65-d3e6800c7c77",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "85733e19-6c52-479c-a8f0-3872bdbd5bfd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae65a019-26a8-45c9-bfb9-2662b84ff2f2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ba722ae-51b8-4e8d-9a1a-917098a3f70e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"id": "356f7d32-446e-4dc1-aa83-a0b816742087",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-3_remove_NA.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-5_analyse_feature.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-1_build_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Token_Analysis_Step-1_build_token_graph.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List all pickle files stored in the Export7_trunc results folder.\n",
"res_path = pwd.joinpath('results/Export7_trunc/')\n",
"contents = [*res_path.glob(r'*.pickle')]\n",
"contents"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "e4415e9c-6ebb-46d9-b06a-eb67df56689e",
"metadata": {},
"outputs": [],
"source": [
"# Select the inputs by file name instead of by position in `contents`: the order of\n",
"# the glob listing depends on the file system and on which result files exist.\n",
"# With the current folder content `contents[-1]` is the TokenGraph pickle, so the\n",
"# object loaded from it is a graph and indexing it with [0] raises KeyError.\n",
"preproc_data = res_path / 'Pipe-TargetFeature_Step-5_analyse_feature.pickle'\n",
"# NOTE(review): assumed to be the duplicate-candidate result that the merge pipeline\n",
"# consumes -- confirm that this is the intended step.\n",
"last_step = res_path / 'Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pickle'"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "2b29672a-c573-4d09-8601-e468a23bad0c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n",
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"# Load both pickled results, in the same order as the paths are listed.\n",
"ret_preproc_data, ret_idx_paris = map(load_pickle, (preproc_data, last_step))"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "20c807da-e64f-4a48-8306-28a0a3dcfae9",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[53], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m idx_pairs \u001b[38;5;241m=\u001b[39m \u001b[43mret_idx_paris\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 2\u001b[0m preproc_data \u001b[38;5;241m=\u001b[39m ret_preproc_data[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[1;32m~\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\networkx\\classes\\graph.py:513\u001b[0m, in \u001b[0;36mGraph.__getitem__\u001b[1;34m(self, n)\u001b[0m\n\u001b[0;32m 489\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, n):\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a dict of neighbors of node n. Use: 'G[n]'.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 511\u001b[0m \u001b[38;5;124;03m AtlasView({1: {}})\u001b[39;00m\n\u001b[0;32m 512\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madj\u001b[49m\u001b[43m[\u001b[49m\u001b[43mn\u001b[49m\u001b[43m]\u001b[49m\n",
"File \u001b[1;32m~\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\networkx\\classes\\coreviews.py:81\u001b[0m, in \u001b[0;36mAdjacencyView.__getitem__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, name):\n\u001b[1;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m AtlasView(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_atlas\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m)\n",
"\u001b[1;31mKeyError\u001b[0m: 0"
]
}
],
"source": [
"# Take element 0 of each loaded result; note that `preproc_data` is rebound here\n",
"# from the file path to the loaded content.\n",
"idx_pairs, preproc_data = ret_idx_paris[0], ret_preproc_data[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1fca8442-c0e9-420f-9ad4-22a3b672dda3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.pipelines:Starting processing pipeline >>Merge_Duplicates<<...\n",
"INFO:ihm_analyse.preprocess:Start merging of similarity candidates...\n",
"INFO:ihm_analyse.graphs:Graph properties: 5465 Nodes, 71087 Edges\n",
"INFO:ihm_analyse.graphs:Node memory: 149.43 KB\n",
"INFO:ihm_analyse.graphs:Edge memory: 3887.57 KB\n",
"INFO:ihm_analyse.graphs:Total memory: 4037.00 KB\n",
"INFO:ihm_analyse.preprocess:Similarity candidates merged successfully.\n",
"INFO:ihm_analyse.helpers:Saved file successfully under results\\Export7_trunc\\Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle\n",
"INFO:ihm_analyse.pipelines:Processing pipeline >>Merge_Duplicates<< successfully ended.\n"
]
}
],
"source": [
"ret = pipe_merge.run(starting_values=(preproc_data, idx_pairs))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "87bd8bba-b0c3-45a1-a9a8-b6bd279cf51f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>445</th>\n",
" <td>Wartung nach Arbeitsplan, siehe Extradaten / A...</td>\n",
" <td>52</td>\n",
" <td>3435</td>\n",
" <td>[563, 604, 616, 617, 15089, 15226, 15276, 1533...</td>\n",
" <td>36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>I/W nach Liste</td>\n",
" <td>14</td>\n",
" <td>238</td>\n",
" <td>[2363, 2364, 2367, 2368, 2369, 2370, 2371, 237...</td>\n",
" <td>85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2377</th>\n",
" <td>1 Wöchentliche Wartung aller 3 Etikettendrucke...</td>\n",
" <td>91</td>\n",
" <td>535</td>\n",
" <td>[111, 121, 127, 209, 219, 220, 221, 222, 236, ...</td>\n",
" <td>73</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2380</th>\n",
" <td>Infratech Meet Di + DO JourFix PT/InT</td>\n",
" <td>38</td>\n",
" <td>183</td>\n",
" <td>28526</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4337</th>\n",
" <td>24.05.2022 10:28:01 (Halm, Karl-Josef) Aktione...</td>\n",
" <td>579</td>\n",
" <td>3817</td>\n",
" <td>[5, 7, 9, 13, 14, 15, 17, 18, 24, 25, 30, 32, ...</td>\n",
" <td>754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3061</th>\n",
" <td>stopper schaltet nicht.</td>\n",
" <td>23</td>\n",
" <td>1</td>\n",
" <td>[15280]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3059</th>\n",
" <td>12.09.2022 13:48:24 (Struzyna, Christian) Temp...</td>\n",
" <td>127</td>\n",
" <td>1</td>\n",
" <td>[12671]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3054</th>\n",
" <td>08.09.2022 12:56:33 (Unruh, Jakob) Neue Serie ...</td>\n",
" <td>262</td>\n",
" <td>1</td>\n",
" <td>[273]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3053</th>\n",
" <td>Preset-Punkt überprüfen und ggf. nachjustieren...</td>\n",
" <td>148</td>\n",
" <td>1</td>\n",
" <td>[273]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3050</th>\n",
" <td>13.09.2022 08:05:40 (Betke, Gennadi) Griefer ...</td>\n",
" <td>79</td>\n",
" <td>1</td>\n",
" <td>[15785]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3627 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry len num_occur \\\n",
"445 Wartung nach Arbeitsplan, siehe Extradaten / A... 52 3435 \n",
"26 I/W nach Liste 14 238 \n",
"2377 1 Wöchentliche Wartung aller 3 Etikettendrucke... 91 535 \n",
"2380 Infratech Meet Di + DO JourFix PT/InT 38 183 \n",
"4337 24.05.2022 10:28:01 (Halm, Karl-Josef) Aktione... 579 3817 \n",
"... ... ... ... \n",
"3061 stopper schaltet nicht. 23 1 \n",
"3059 12.09.2022 13:48:24 (Struzyna, Christian) Temp... 127 1 \n",
"3054 08.09.2022 12:56:33 (Unruh, Jakob) Neue Serie ... 262 1 \n",
"3053 Preset-Punkt überprüfen und ggf. nachjustieren... 148 1 \n",
"3050 13.09.2022 08:05:40 (Betke, Gennadi) Griefer ... 79 1 \n",
"\n",
" assoc_obj_ids num_assoc_obj_ids \n",
"445 [563, 604, 616, 617, 15089, 15226, 15276, 1533... 36 \n",
"26 [2363, 2364, 2367, 2368, 2369, 2370, 2371, 237... 85 \n",
"2377 [111, 121, 127, 209, 219, 220, 221, 222, 236, ... 73 \n",
"2380 28526 1 \n",
"4337 [5, 7, 9, 13, 14, 15, 17, 18, 24, 25, 30, 32, ... 754 \n",
"... ... ... \n",
"3061 [15280] 1 \n",
"3059 [12671] 1 \n",
"3054 [273] 1 \n",
"3053 [273] 1 \n",
"3050 [15785] 1 \n",
"\n",
"[3627 rows x 5 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43e6d41c-7a49-4756-9629-0ec0ee6c5b7c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1a9969fa-6b0d-466a-bd4f-1ba5f4868873",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" int('23456')\n",
"except ValueError:\n",
" print('went wrong')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76af32de-5f0a-4d7e-9751-5f2a38a7a69e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bd5cffa-0b09-45c7-bc15-0cd3082353d7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e8423609-c95d-42c8-99f3-95274fa52ae8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-3_remove_NA.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-TargetFeature_Step-5_analyse_feature.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-1_build_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-2_filt_thresh_cosSim_matrix.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Embedding1_Step-3_list_cosSim_dupl_candidates.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Pipe-Token_Analysis_Step-1_build_token_graph.pickle'),\n",
" WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-list the pickle files currently present in the Export7_trunc results folder.\n",
"res_path = pwd.joinpath('results/Export7_trunc/')\n",
"contents = [*res_path.glob(r'*.pickle')]\n",
"contents"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "71fa1c2e-22cf-483a-964c-a5cca2bd3790",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc/Token_Analysis-TokenGraph.pickle')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Address the token graph by its file name rather than by its position in the glob\n",
"# listing, whose order depends on the file system and on the files present.\n",
"path_to_graph = res_path / 'Token_Analysis-TokenGraph.pickle'\n",
"path_to_graph"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9101beaf-6a7c-4987-9c44-141386966291",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded file successfully.\n"
]
}
],
"source": [
"tk_graph = load_pickle(path_to_graph)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e08954c3-9a5f-43c8-a98e-f9b8a74c3ff5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 10536, number of edges: 48562)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2a8d0abb-d68b-4c6e-80e9-b3b27998c8d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 10536,\n",
" 'num_edges': 46393,\n",
" 'min_edge_weight': 1,\n",
" 'max_edge_weight': 15374,\n",
" 'node_memory': 652596,\n",
" 'edge_memory': 2598008,\n",
" 'total_memory': 3250604}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph.metadata_undirected"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "cc34c667-5a33-4061-a83a-50fc8c537b19",
"metadata": {},
"outputs": [],
"source": [
"# Filter by edge weight (argument 100) first, then by node degree (argument 1).\n",
"tk_graph_filtered = tk_graph.filter_by_edge_weight(100).filter_by_node_degree(1)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "67df971c-fe7a-4f88-89ae-ba1366da1166",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 289,\n",
" 'num_edges': 457,\n",
" 'min_edge_weight': 100,\n",
" 'max_edge_weight': 15369,\n",
" 'node_memory': 17674,\n",
" 'edge_memory': 25592,\n",
" 'total_memory': 43266}"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk_graph_filtered.metadata_undirected"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "8c524312-aff4-47f4-801e-ad8112aa2a70",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/Instandhaltung/results/Export7_trunc')"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"save_path_graph = res_path\n",
"save_path_graph"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "b62c888f-620d-4b29-924b-45ea17d99bc1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.graphs:Successfully saved graph as GraphML file under A:\\Arbeitsaufgaben\\Instandhaltung\\results\\Export7_trunc\\TokenGraph-filtered.graphml.\n"
]
}
],
"source": [
"tk_graph_filtered.save_graph(save_path_graph, filename='TokenGraph-filtered')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca21e2d3-dc5a-4117-8be9-d132ba2c8d28",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}