diff --git a/lang_main_config.toml b/lang_main_config.toml new file mode 100644 index 0000000..c694e25 --- /dev/null +++ b/lang_main_config.toml @@ -0,0 +1,56 @@ +# lang_main: Config file + +[paths] +inputs = './inputs/' +results = './results/test_new2/' +dataset = './01_2_Rohdaten_neu/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = false +graph_postprocessing = false +graph_postprocessing_skip = false +time_analysis = false +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + +[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +input_features = [ + 'VorgangsTypName', + 'VorgangsArtText', + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e092c6f..c59bde7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "lang-main" -version = "0.1.0" +version = "0.1.0dev1" description = "Several tools to analyse maintenance data with strong focus on language processing" authors = [ {name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"}, @@ -40,6 +40,7 @@ trials = [ line-length = 94 indent-width = 4 target-version = "py311" +src = ["src"] [tool.ruff.format] quote-style = "single" diff --git a/scripts/dashboard/cyto.py b/scripts/dashboard/cyto.py index 919655b..b70270e 100644 --- a/scripts/dashboard/cyto.py +++ b/scripts/dashboard/cyto.py @@ -3,26 +3,74 @@ import webbrowser from pathlib import Path from threading import Thread from typing import cast +import copy import dash_cytoscape as cyto -import lang_main.io from dash import Dash, Input, Output, State, dcc, html +from dash.exceptions import PreventUpdate + +import lang_main.io from lang_main.analysis import graphs target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl' p = Path(target).resolve() ret = lang_main.io.load_pickle(p) tk_graph = cast(graphs.TokenGraph, ret[0]) -tk_graph_filtered = tk_graph.filter_by_edge_weight(150) -tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1) -cyto_data, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered) +tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None) +tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None) +cyto_data_base, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered) MIN_WEIGHT = weight_data['min'] MAX_WEIGHT = weight_data['max'] cyto.load_extra_layouts() -app = Dash(__name__) +external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] +app = Dash(__name__, external_stylesheets=external_stylesheets) + +cose_layout = { + 'name': 'cose', + 'nodeOverlap': 20, + 'refresh': 20, + 'fit': True, + 'padding': 30, + 'randomize': True, + 'componentSpacing': 40, + 'nodeRepulsion': 2000, + 'edgeElasticity': 1000, + 'idealEdgeLength': 100, + 'nestingFactor': 1.2, + 'gravity': 50, + 'numIter': 2000, + 'initialTemp': 1000, + 'coolingFactor': 0.95, + 'minTemp': 1.0, + 'nodeDimensionsIncludeLabels': True, +} + +cose_bilkent_layout = { + 'name': 'cose-bilkent', + 'nodeDimensionsIncludeLabels': True, + 'idealEdgeLength': 100, + 'edgeElasticity': 0.45, + 'nodeRepulsion': 10000, + 'nestingFactor': 0.1, + 'gravity': 0.25, + 'numIter': 2500, + 'initialTemp': 1000, + 'coolingFactor': 0.95, + 'minTemp': 1.0, +} + +cola_layout = { + 'name': 'cola', + 'nodeDimensionsIncludeLabels': True, + 'nodeSpacing': 30, + 'edgeLength': 45, + 'animate': True, + 'centerGraph': True, + 'randomize': False, +} my_stylesheet = [ # Group selectors @@ -45,7 +93,10 @@ my_stylesheet = [ { 'selector': 'edge', 'style': { - 'width': 2, + #'width': f'mapData(weight, {MIN_WEIGHT}, {MAX_WEIGHT}, 1, 10)', + # 'width': """function(ele) { + # return ele.data('weight'); + # """, 'curve-style': 'bezier', 'line-color': 'grey', 'line-style': 'solid', @@ -59,130 +110,201 @@ my_stylesheet = [ app.layout = html.Div( [ - html.Button('Reset', id='bt-reset'), + html.Button('Trigger JS Layout', id='test_js'), + html.Button('Trigger JS Weight', id='test_js_weight'), + html.Div(id='output'), + html.Div( + [ + html.H2('Token Graph', style={'margin': 0}), + html.Button( + 'Reset Default', + id='bt-reset', + style={ + 'marginLeft': 'auto', + 'width': '300px', + }, + ), + ], + style={ + 'display': 'flex', + 'marginBottom': '1em', + }, + ), + html.H3('Layout'), dcc.Dropdown( - id='layout_choice_internal', + id='layout_choice', options=[ - 'random', - 'grid', - 'circle', - 'concentric', - 'breadthfirst', 'cose', + 'cola', + 'euler', + 'random', ], value='cose', clearable=False, ), - dcc.Dropdown( - id='layout_choice_external', - options=[ - 'cose-bilkent', - 'cola', - 'euler', - 'spread', - 'dagre', - 'klay', + html.Div( + [ + html.H3('Graph Filter'), + dcc.Input( + id='weight_min', + type='number', + min=MIN_WEIGHT, + max=MAX_WEIGHT, + step=1, + placeholder=f'Minimum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}', + debounce=True, + style={'width': '40%'}, + ), + dcc.Input( + id='weight_max', + type='number', + min=MIN_WEIGHT, + max=MAX_WEIGHT, + step=1, + placeholder=f'Maximum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}', + debounce=True, + style={'width': '40%'}, + ), + html.H3('Graph'), + html.Div( + [ + cyto.Cytoscape( + id='cytoscape-graph', + style={'width': '100%', 'height': '600px'}, + stylesheet=my_stylesheet, + elements=cyto_data_base, + zoom=1, + ), + ], + style={ + 'border': '3px solid black', + 'borderRadius': '25px', + 'marginTop': '1em', + 'marginBottom': '2em', + 'padding': '7px', + }, + ), ], - clearable=False, + style={'marginTop': '1em'}, ), - dcc.RangeSlider( - id='weight_slider', - min=MIN_WEIGHT, - max=MAX_WEIGHT, - step=1000, - ), - cyto.Cytoscape( - id='cytoscape-graph', - layout={'name': 'cose'}, - style={'width': '100%', 'height': '600px'}, - stylesheet=my_stylesheet, - elements=cyto_data, - zoom=1, - ), - ] + ], + style={'margin': '2em'}, ) @app.callback( Output('cytoscape-graph', 'layout', allow_duplicate=True), - Input('layout_choice_internal', 'value'), + Input('layout_choice', 'value'), prevent_initial_call=True, ) def update_layout_internal(layout_choice): - return {'name': layout_choice} - - -@app.callback( - Output('cytoscape-graph', 'layout', allow_duplicate=True), - Input('layout_choice_external', 'value'), - prevent_initial_call=True, -) -def update_layout_external(layout_choice): - return {'name': layout_choice} + # return {'name': layout_choice} + return cose_layout + # return cose_bilkent_layout + # return cola_layout @app.callback( Output('cytoscape-graph', 'zoom'), - Output('cytoscape-graph', 'elements'), + Output('cytoscape-graph', 'elements', allow_duplicate=True), + Output('weight_min', 'value'), + Output('weight_max', 'value'), Input('bt-reset', 'n_clicks'), prevent_initial_call=True, ) def reset_layout(n_clicks): - return (1, cyto_data) + return (1, cyto_data_base, None, None) -# @app.callback( -# Output('cytoscape-graph', 'stylesheet'), -# Input('weight_slider', 'value'), -# State('cytoscape-graph', 'stylesheet'), +# update edge weight +@app.callback( + Output('cytoscape-graph', 'elements', allow_duplicate=True), + Input('weight_min', 'value'), + Input('weight_max', 'value'), + prevent_initial_call=True, +) +def update_edge_weight(weight_min, weight_max): + if not any([weight_min, weight_max]): + return cyto_data_base + + if weight_min is None: + weight_min = MIN_WEIGHT + if weight_max is None: + weight_max = MAX_WEIGHT + tk_graph_filtered = tk_graph.filter_by_edge_weight(weight_min, weight_max) + tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None) + cyto_data, _ = graphs.convert_graph_to_cytoscape(tk_graph_filtered) + return cyto_data + + +# app.clientside_callback( +# """ +# function(n_clicks, layout) { +# let threshold = 1000; +# layout.edgeLength = function(edge) { +# let weight = edge.data().weight; +# let length; +# if (weight > threshold) { +# length = 10; +# } else { +# length = 1000 / edge.data().weight; +# length = Math.max(20, length); +# } +# return length; +# }; +# cy.layout(layout).run(); +# return layout; +# } +# """, +# Output('cytoscape-graph', 'layout', allow_duplicate=True), +# Input('test_js', 'n_clicks'), +# State('cytoscape-graph', 'layout'), # prevent_initial_call=True, # ) -# def select_weight(range_chosen, stylesheet): -# min_weight, max_weight = range_chosen -# new_stylesheet = stylesheet.copy() -# new_stylesheet.append( -# { -# 'selector': f'[weight >= {min_weight}]', -# 'style': {'line-color': 'blue', 'line-style': 'dashed'}, -# } -# ) -# new_stylesheet.append( -# { -# 'selector': f'[weight <= {max_weight}]', -# 'style': {'line-color': 'blue', 'line-style': 'dashed'}, -# } -# ) -# return new_stylesheet +app.clientside_callback( + """ + function(n_clicks, layout) { + layout.edgeElasticity = function(edge) { + return edge.data().weight * 4; + }; + layout.idealEdgeLength = function(edge) { + return edge.data().weight * 0.8; + }; + cy.layout(layout).run(); + return layout; + } + """, + Output('cytoscape-graph', 'layout', allow_duplicate=True), + Input('test_js', 'n_clicks'), + State('cytoscape-graph', 'layout'), + prevent_initial_call=True, +) -# app.layout = html.Div( -# [ -# cyto.Cytoscape( -# id='cytoscape-two-nodes', -# layout={'name': 'preset'}, -# style={'width': '100%', 'height': '400px'}, -# stylesheet=my_stylesheet, -# elements=[ -# { -# 'data': { -# 'id': 'one', -# 'label': 'Titel 1', -# }, -# 'position': {'x': 75, 'y': 75}, -# 'grabbable': False, -# #'locked': True, -# 'classes': 'red', -# }, -# { -# 'data': {'id': 'two', 'label': 'Title 2'}, -# 'position': {'x': 200, 'y': 200}, -# 'classes': 'triangle', -# }, -# {'data': {'source': 'one', 'target': 'two', 'weight': 2000}}, -# ], -# ) -# ] -# ) +app.clientside_callback( + """ + function(n_clicks, stylesheet) { + function edge_weight(ele) { + let threshold = 1000; + let weight = ele.data('weight'); + if (weight > threshold) { + weight = 12; + } else { + weight = weight / threshold * 10; + weight = Math.max(1, weight); + } + return weight; + } + stylesheet[1].style.width = edge_weight; + cy.style(stylesheet).update(); + return stylesheet; + } + """, + Output('cytoscape-graph', 'stylesheet'), + Input('test_js_weight', 'n_clicks'), + State('cytoscape-graph', 'stylesheet'), + prevent_initial_call=False, +) def _start_webbrowser(): diff --git a/scripts/dashboard/test.py b/scripts/dashboard/test.py new file mode 100644 index 0000000..f287ba4 --- /dev/null +++ b/scripts/dashboard/test.py @@ -0,0 +1,18 @@ +from pathlib import Path +from typing import cast +import statistics + +import lang_main.io +from lang_main.analysis import graphs + +# target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl' +# p = Path(target).resolve() +# ret = lang_main.io.load_pickle(p) +# tk_graph = cast(graphs.TokenGraph, ret[0]) +# tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None) +# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None) +# cyto_data_base, weight_data, all_weights = graphs.convert_graph_to_cytoscape(tk_graph_filtered) + + +test = [1, 1, 1, 2, 2, 3, 3, 4, 4, 1000] +print(statistics.mean(test)) diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py index af61496..b282495 100644 --- a/src/lang_main/analysis/graphs.py +++ b/src/lang_main/analysis/graphs.py @@ -130,7 +130,7 @@ def convert_graph_to_cytoscape( cyto_data: list[CytoscapeData] = [] # iterate over nodes nodes = cast(Iterable[NodeTitle], graph.nodes) - for i, node in enumerate(nodes): + for node in nodes: node_data: CytoscapeData = { 'data': { 'id': node, @@ -151,7 +151,7 @@ def convert_graph_to_cytoscape( ], graph.edges.data('weight', default=1), # type: ignore ) - for i, (source, target, weight) in enumerate(edges): + for source, target, weight in edges: weights.add(weight) edge_data: CytoscapeData = { 'data': { @@ -288,27 +288,35 @@ class TokenGraph(DiGraph): def filter_by_edge_weight( self, - threshold: int, + bound_lower: int | None, + bound_upper: int | None, ) -> Self: - """filters all edges which are below the given threshold + """filters all edges which are within the provided bounds Parameters ---------- - threshold : int - edges with weights smaller than this value will be removed + bound_lower : int | None + lower bound for edge weights, edges with weight equal to this value are retained + bound_upper : int | None + upper bound for edge weights, edges with weight equal to this value are retained Returns ------- Self a copy of the graph with filtered edges """ - # filter edges by weight original_graph_edges = copy.deepcopy(self.edges) filtered_graph = self.copy() + if not any([bound_lower, bound_upper]): + logger.warning('No bounds provided, returning original graph.') + return filtered_graph + for edge in original_graph_edges: weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight']) - if weight < threshold: + if bound_lower is not None and weight < bound_lower: + filtered_graph.remove_edge(edge[0], edge[1]) + if bound_upper is not None and weight > bound_upper: filtered_graph.remove_edge(edge[0], edge[1]) if filtered_graph._undirected is not None: @@ -320,14 +328,17 @@ class TokenGraph(DiGraph): def filter_by_node_degree( self, - threshold: int, + bound_lower: int | None, + bound_upper: int | None, ) -> Self: - """filters all nodes which have a degree below the given threshold + """filters all nodes which are within the provided bounds by their degree Parameters ---------- - threshold : int - nodes with a degree smaller than this value will be removed + bound_lower : int | None + lower bound for node degree, nodes with degree equal to this value are retained + bound_upper : int | None + upper bound for node degree, nodes with degree equal to this value are retained Returns ------- @@ -338,9 +349,15 @@ class TokenGraph(DiGraph): original_graph_nodes = copy.deepcopy(self.nodes) filtered_graph = self.copy() + if not any([bound_lower, bound_upper]): + logger.warning('No bounds provided, returning original graph.') + return filtered_graph + for node in original_graph_nodes: degree = filtered_graph.degree[node] # type: ignore - if degree < threshold: + if bound_lower is not None and degree < bound_lower: + filtered_graph.remove_node(node) + if bound_upper is not None and degree > bound_upper: filtered_graph.remove_node(node) if filtered_graph._undirected is not None: diff --git a/src/lang_main/types.py b/src/lang_main/types.py index cafe29e..731a630 100644 --- a/src/lang_main/types.py +++ b/src/lang_main/types.py @@ -21,7 +21,7 @@ class STFRDeviceTypes(enum.StrEnum): GPU = 'cuda' -# ** datatsets +# ** datasets PandasIndex: TypeAlias = int | np.int64 ObjectID: TypeAlias = int Embedding: TypeAlias = SpacyDoc | Tensor