Commit b3e35e7dd1 (parent b3cc012791): using dash-cytoscape
pdm.lock (generated, 15 lines changed)
@@ -5,7 +5,7 @@
 groups = ["default", "notebooks", "trials"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:7574154c6728ede3eaf76a8b1a3b5d4339fcc8f2dc8c41042401004b6583e151"
+content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98"
 
 [[package]]
 name = "annotated-types"
@@ -432,6 +432,19 @@ files = [
     {file = "dash_core_components-2.0.0.tar.gz", hash = "sha256:c6733874af975e552f95a1398a16c2ee7df14ce43fa60bb3718a3c6e0b63ffee"},
 ]
 
+[[package]]
+name = "dash-cytoscape"
+version = "1.0.1"
+requires_python = ">=3.8"
+summary = "A Component Library for Dash aimed at facilitating network visualization in Python, wrapped around Cytoscape.js"
+groups = ["trials"]
+dependencies = [
+    "dash",
+]
+files = [
+    {file = "dash_cytoscape-1.0.1.tar.gz", hash = "sha256:1bcd1587b2d8b432945585e2295e76393d3eb829f606c198693cd2b45bea6adc"},
+]
+
 [[package]]
 name = "dash-html-components"
 version = "2.0.0"
@@ -33,6 +33,7 @@ notebooks = [
 trials = [
     "plotly>=5.22.0",
     "dash>=2.17.0",
+    "dash-cytoscape>=1.0.1",
 ]
 
 [tool.ruff]
@@ -28,6 +28,8 @@ from lang_main.pipelines.predefined import (
 )
 from lang_main.types import (
     ObjectID,
+    PandasIndex,
+    SpacyDoc,
     TimelineCandidates,
 )
 from pandas import DataFrame, Series
@@ -37,7 +39,7 @@ from pandas import DataFrame, Series
 def run_preprocessing() -> DataFrame:
     create_saving_folder(
         saving_path_folder=SAVE_PATH_FOLDER,
-        overwrite_existing=True,
+        overwrite_existing=False,
     )
     # run pipelines
     ret = typing.cast(
@@ -56,15 +58,16 @@ def run_preprocessing() -> DataFrame:
 
 def run_token_analysis(
     preprocessed_data: DataFrame,
-) -> TokenGraph:
+) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
     # build token graph
-    (tk_graph,) = typing.cast(
-        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
+    (tk_graph, docs_mapping) = typing.cast(
+        tuple[TokenGraph, dict[PandasIndex, SpacyDoc]],
+        pipe_token_analysis.run(starting_values=(preprocessed_data,)),
     )
     tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
     tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
 
-    return tk_graph
+    return tk_graph, docs_mapping
 
 
 def run_graph_postprocessing(
@@ -127,9 +130,9 @@ def main() -> None:
             'Preprocessing step skipped. Token analysis cannot be performed.'
         )
         preprocessed_data_trunc = typing.cast(
-            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
+            DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
        )  # type: ignore
-        tk_graph = run_token_analysis(preprocessed_data_trunc)
+        tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
     elif not SKIP_TOKEN_ANALYSIS:
         # !! hardcoded result filenames
         # whole graph
@@ -16,7 +16,6 @@ from dash import (
     dcc,
     html,
 )
-from lang_main import CALLER_PATH
 from lang_main.io import load_pickle
 from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame
@@ -24,12 +23,8 @@ from pandas import DataFrame
 # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
 
 # ** data
-# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
-p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
-# p_tl = Path(
-#     r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
-# )
-p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
+p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
+p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
 ret = cast(DataFrame, load_pickle(p_df))
 data = ret[0]
 ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -133,7 +128,7 @@ def update_timeline(index, obj_id):
     cands_obj_id = cands[obj_id]
     cands_choice = cands_obj_id[int(index) - 1]
     # data
-    df = data.loc[list(cands_choice)].sort_index()
+    df = data.loc[list(cands_choice)].sort_index()  # type: ignore
     # figure
     fig = px.line(
         data_frame=df,
@@ -164,7 +159,7 @@ def update_table_candidates(index, obj_id):
     cands_obj_id = cands[obj_id]
     cands_choice = cands_obj_id[int(index) - 1]
     # data
-    df = data.loc[list(cands_choice)].sort_index()
+    df = data.loc[list(cands_choice)].sort_index()  # type: ignore
     df = df.filter(items=table_feats, axis=1).sort_values(
         by='ErstellungsDatum', ascending=True
     )
scripts/dashboard/cyto.py (new file, 203 lines)
@@ -0,0 +1,203 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast

import dash_cytoscape as cyto
import lang_main.io
from dash import Dash, Input, Output, State, dcc, html
from lang_main.analysis import graphs

target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
p = Path(target).resolve()
ret = lang_main.io.load_pickle(p)
tk_graph = cast(graphs.TokenGraph, ret[0])
tk_graph_filtered = tk_graph.filter_by_edge_weight(150)
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
cyto_data, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)

MIN_WEIGHT = weight_data['min']
MAX_WEIGHT = weight_data['max']


cyto.load_extra_layouts()
app = Dash(__name__)

my_stylesheet = [
    # Group selectors
    {
        'selector': 'node',
        'style': {
            'shape': 'circle',
            'content': 'data(label)',
            'background-color': '#B10DC9',
            'border-width': 2,
            'border-color': 'black',
            'border-opacity': 1,
            'opacity': 1,
            'color': 'black',
            'text-opacity': 1,
            'font-size': 12,
            'z-index': 9999,
        },
    },
    {
        'selector': 'edge',
        'style': {
            'width': 2,
            'curve-style': 'bezier',
            'line-color': 'grey',
            'line-style': 'solid',
            'line-opacity': 1,
        },
    },
    # Class selectors
    # {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
    # {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]

app.layout = html.Div(
    [
        html.Button('Reset', id='bt-reset'),
        dcc.Dropdown(
            id='layout_choice_internal',
            options=[
                'random',
                'grid',
                'circle',
                'concentric',
                'breadthfirst',
                'cose',
            ],
            value='cose',
            clearable=False,
        ),
        dcc.Dropdown(
            id='layout_choice_external',
            options=[
                'cose-bilkent',
                'cola',
                'euler',
                'spread',
                'dagre',
                'klay',
            ],
            clearable=False,
        ),
        dcc.RangeSlider(
            id='weight_slider',
            min=MIN_WEIGHT,
            max=MAX_WEIGHT,
            step=1000,
        ),
        cyto.Cytoscape(
            id='cytoscape-graph',
            layout={'name': 'cose'},
            style={'width': '100%', 'height': '600px'},
            stylesheet=my_stylesheet,
            elements=cyto_data,
            zoom=1,
        ),
    ]
)


@app.callback(
    Output('cytoscape-graph', 'layout', allow_duplicate=True),
    Input('layout_choice_internal', 'value'),
    prevent_initial_call=True,
)
def update_layout_internal(layout_choice):
    return {'name': layout_choice}


@app.callback(
    Output('cytoscape-graph', 'layout', allow_duplicate=True),
    Input('layout_choice_external', 'value'),
    prevent_initial_call=True,
)
def update_layout_external(layout_choice):
    return {'name': layout_choice}


@app.callback(
    Output('cytoscape-graph', 'zoom'),
    Output('cytoscape-graph', 'elements'),
    Input('bt-reset', 'n_clicks'),
    prevent_initial_call=True,
)
def reset_layout(n_clicks):
    return (1, cyto_data)


# @app.callback(
#     Output('cytoscape-graph', 'stylesheet'),
#     Input('weight_slider', 'value'),
#     State('cytoscape-graph', 'stylesheet'),
#     prevent_initial_call=True,
# )
# def select_weight(range_chosen, stylesheet):
#     min_weight, max_weight = range_chosen
#     new_stylesheet = stylesheet.copy()
#     new_stylesheet.append(
#         {
#             'selector': f'[weight >= {min_weight}]',
#             'style': {'line-color': 'blue', 'line-style': 'dashed'},
#         }
#     )
#     new_stylesheet.append(
#         {
#             'selector': f'[weight <= {max_weight}]',
#             'style': {'line-color': 'blue', 'line-style': 'dashed'},
#         }
#     )
#     return new_stylesheet


# app.layout = html.Div(
#     [
#         cyto.Cytoscape(
#             id='cytoscape-two-nodes',
#             layout={'name': 'preset'},
#             style={'width': '100%', 'height': '400px'},
#             stylesheet=my_stylesheet,
#             elements=[
#                 {
#                     'data': {
#                         'id': 'one',
#                         'label': 'Titel 1',
#                     },
#                     'position': {'x': 75, 'y': 75},
#                     'grabbable': False,
#                     #'locked': True,
#                     'classes': 'red',
#                 },
#                 {
#                     'data': {'id': 'two', 'label': 'Title 2'},
#                     'position': {'x': 200, 'y': 200},
#                     'classes': 'triangle',
#                 },
#                 {'data': {'source': 'one', 'target': 'two', 'weight': 2000}},
#             ],
#         )
#     ]
# )


def _start_webbrowser():
    host = '127.0.0.1'
    port = '8050'
    adress = f'http://{host}:{port}/'
    time.sleep(2)
    webbrowser.open_new(adress)


def main():
    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
    webbrowser_thread.start()
    app.run(debug=True)


if __name__ == '__main__':
    main()
scripts/dashboard/cyto_2.py (new file, 368 lines)
@@ -0,0 +1,368 @@
import json
import os

import dash
import dash_cytoscape as cyto
from dash import Input, Output, State, callback, dcc, html

# Load extra layouts
cyto.load_extra_layouts()


# Display utility functions
def _merge(a, b):
    return dict(a, **b)


def _omit(omitted_keys, d):
    return {k: v for k, v in d.items() if k not in omitted_keys}


# Custom Display Components
def Card(children, **kwargs):
    return html.Section(
        children,
        style=_merge(
            {
                'padding': 20,
                'margin': 5,
                'borderRadius': 5,
                'border': 'thin lightgrey solid',
                'background-color': 'white',
                # Remove possibility to select the text for better UX
                'user-select': 'none',
                '-moz-user-select': 'none',
                '-webkit-user-select': 'none',
                '-ms-user-select': 'none',
            },
            kwargs.get('style', {}),
        ),
        **_omit(['style'], kwargs),
    )


def SectionTitle(title, size, align='center', color='#222'):
    return html.Div(
        style={'text-align': align, 'color': color},
        children=dcc.Markdown('#' * size + ' ' + title),
    )


def NamedCard(title, size, children, **kwargs):
    size = min(size, 6)
    size = max(size, 1)

    return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])


def NamedSlider(name, **kwargs):
    return html.Div(
        style={'padding': '20px 10px 25px 4px'},
        children=[
            html.P(f'{name}:'),
            html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
        ],
    )


def NamedDropdown(name, **kwargs):
    return html.Div(
        style={'margin': '10px 0px'},
        children=[
            html.P(children=f'{name}:', style={'margin-left': '3px'}),
            dcc.Dropdown(**kwargs),
        ],
    )


def NamedRadioItems(name, **kwargs):
    return html.Div(
        style={'padding': '20px 10px 25px 4px'},
        children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
    )


def NamedInput(name, **kwargs):
    return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])


# Utils
def DropdownOptionsList(*args):
    return [{'label': val.capitalize(), 'value': val} for val in args]


asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')

app = dash.Dash(__name__, assets_folder=asset_path)
server = app.server


# ###################### DATA PREPROCESSING ######################
# Load data
with open('sample_network.txt', 'r', encoding='utf-8') as f:
    network_data = f.read().split('\n')

# We select the first 750 edges and associated nodes for an easier visualization
edges = network_data[:750]
nodes = set()

following_node_di = {}  # user id -> list of users they are following
following_edges_di = {}  # user id -> list of cy edges starting from user id

followers_node_di = {}  # user id -> list of followers (cy_node format)
followers_edges_di = {}  # user id -> list of cy edges ending at user id

cy_edges = []
cy_nodes = []

for edge in edges:
    if ' ' not in edge:
        continue

    source, target = edge.split(' ')

    cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
    cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
    cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}

    if source not in nodes:
        nodes.add(source)
        cy_nodes.append(cy_source)
    if target not in nodes:
        nodes.add(target)
        cy_nodes.append(cy_target)

    # Process dictionary of following
    if not following_node_di.get(source):
        following_node_di[source] = []
    if not following_edges_di.get(source):
        following_edges_di[source] = []

    following_node_di[source].append(cy_target)
    following_edges_di[source].append(cy_edge)

    # Process dictionary of followers
    if not followers_node_di.get(target):
        followers_node_di[target] = []
    if not followers_edges_di.get(target):
        followers_edges_di[target] = []

    followers_node_di[target].append(cy_source)
    followers_edges_di[target].append(cy_edge)

genesis_node = cy_nodes[0]
genesis_node['classes'] = 'genesis'
default_elements = [genesis_node]

default_stylesheet = [
    {'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
    {
        'selector': 'edge',
        'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
    },
    {'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
    {
        'selector': '.followerEdge',
        'style': {
            'mid-target-arrow-color': 'blue',
            'mid-target-arrow-shape': 'vee',
            'line-color': '#0074D9',
        },
    },
    {'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
    {
        'selector': '.followingEdge',
        'style': {
            'mid-target-arrow-color': 'red',
            'mid-target-arrow-shape': 'vee',
            'line-color': '#FF4136',
        },
    },
    {
        'selector': '.genesis',
        'style': {
            'background-color': '#B10DC9',
            'border-width': 2,
            'border-color': 'purple',
            'border-opacity': 1,
            'opacity': 1,
            'label': 'data(label)',
            'color': '#B10DC9',
            'text-opacity': 1,
            'font-size': 12,
            'z-index': 9999,
        },
    },
    {
        'selector': ':selected',
        'style': {
            'border-width': 2,
            'border-color': 'black',
            'border-opacity': 1,
            'opacity': 1,
            'label': 'data(label)',
            'color': 'black',
            'font-size': 12,
            'z-index': 9999,
        },
    },
]

# ################################# APP LAYOUT ################################
styles = {
    'json-output': {
        'overflow-y': 'scroll',
        'height': 'calc(50% - 25px)',
        'border': 'thin lightgrey solid',
    },
    'tab': {'height': 'calc(98vh - 80px)'},
}

app.layout = html.Div(
    [
        html.Div(
            className='eight columns',
            children=[
                cyto.Cytoscape(
                    id='cytoscape',
                    elements=default_elements,
                    stylesheet=default_stylesheet,
                    style={'height': '95vh', 'width': '100%'},
                )
            ],
        ),
        html.Div(
            className='four columns',
            children=[
                dcc.Tabs(
                    id='tabs',
                    children=[
                        dcc.Tab(
                            label='Control Panel',
                            children=[
                                NamedDropdown(
                                    name='Layout',
                                    id='dropdown-layout',
                                    options=DropdownOptionsList(
                                        'random',
                                        'grid',
                                        'circle',
                                        'concentric',
                                        'breadthfirst',
                                        'cose',
                                        'cose-bilkent',
                                        'dagre',
                                        'cola',
                                        'klay',
                                        'spread',
                                        'euler',
                                    ),
                                    value='grid',
                                    clearable=False,
                                ),
                                NamedRadioItems(
                                    name='Expand',
                                    id='radio-expand',
                                    options=DropdownOptionsList('followers', 'following'),
                                    value='followers',
                                ),
                            ],
                        ),
                        dcc.Tab(
                            label='JSON',
                            children=[
                                html.Div(
                                    style=styles['tab'],
                                    children=[
                                        html.P('Node Object JSON:'),
                                        html.Pre(
                                            id='tap-node-json-output',
                                            style=styles['json-output'],
                                        ),
                                        html.P('Edge Object JSON:'),
                                        html.Pre(
                                            id='tap-edge-json-output',
                                            style=styles['json-output'],
                                        ),
                                    ],
                                )
                            ],
                        ),
                    ],
                ),
            ],
        ),
    ]
)


# ############################## CALLBACKS ####################################
@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
def display_tap_node(data):
    return json.dumps(data, indent=2)


@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
def display_tap_edge(data):
    return json.dumps(data, indent=2)


@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
def update_cytoscape_layout(layout):
    return {'name': layout}


@callback(
    Output('cytoscape', 'elements'),
    Input('cytoscape', 'tapNodeData'),
    State('cytoscape', 'elements'),
    State('radio-expand', 'value'),
)
def generate_elements(nodeData, elements, expansion_mode):
    if not nodeData:
        return default_elements

    # If the node has already been expanded, we don't expand it again
    if nodeData.get('expanded'):
        return elements

    # This retrieves the currently selected element, and tag it as expanded
    for element in elements:
        if nodeData['id'] == element.get('data').get('id'):
            element['data']['expanded'] = True
            break

    if expansion_mode == 'followers':
        followers_nodes = followers_node_di.get(nodeData['id'])
        followers_edges = followers_edges_di.get(nodeData['id'])

        if followers_nodes:
            for node in followers_nodes:
                node['classes'] = 'followerNode'
            elements.extend(followers_nodes)

        if followers_edges:
            for follower_edge in followers_edges:
                follower_edge['classes'] = 'followerEdge'
            elements.extend(followers_edges)

    elif expansion_mode == 'following':
        following_nodes = following_node_di.get(nodeData['id'])
        following_edges = following_edges_di.get(nodeData['id'])

        if following_nodes:
            for node in following_nodes:
                if node['data']['id'] != genesis_node['data']['id']:
                    node['classes'] = 'followingNode'
                    elements.append(node)

        if following_edges:
            for follower_edge in following_edges:
                follower_edge['classes'] = 'followingEdge'
            elements.extend(following_edges)

    return elements


if __name__ == '__main__':
    app.run_server(debug=True)
scripts/dashboard/sample_network.txt (new file, 10297 lines)
File diff suppressed because it is too large.
@@ -10,14 +10,14 @@ dataset = '../data/02_202307/Export4.csv'
 #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
 
 [control]
-preprocessing = true
-preprocessing_skip = true
-token_analysis = false
-token_analysis_skip = true
+preprocessing = false
+preprocessing_skip = false
+token_analysis = true
+token_analysis_skip = false
 graph_postprocessing = false
 graph_postprocessing_skip = true
-time_analysis = true
-time_analysis_skip = false
+time_analysis = false
+time_analysis_skip = true
 
 #[export_filenames]
 #filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
@@ -1,9 +1,15 @@
+from pathlib import Path
+
 from lang_main.constants import (
     INPUT_PATH_FOLDER,
     PATH_TO_DATASET,
     SAVE_PATH_FOLDER,
+    input_path_conf,
 )
 
 print(SAVE_PATH_FOLDER, '\n')
 print(INPUT_PATH_FOLDER, '\n')
 print(PATH_TO_DATASET, '\n')
+
+print('------------------------')
+print(Path.cwd(), '\n', input_path_conf)
src/lang_main/__init__ copy.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
import warnings

from lang_main.io import load_toml_config

__all__ = [
    'CALLER_PATH',
]

logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
    stream=sys.stdout,
    format=LOG_FMT,
    datefmt=LOG_DATE_FMT,
)

CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = True
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
    if not caller_file.exists():
        warnings.warn('Caller file could not be correctly retrieved.')
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the programm.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
@@ -1,4 +1,3 @@
-import inspect
 import logging
 import shutil
 import sys
@@ -8,10 +7,6 @@ from typing import Any, Final
 
 from lang_main.io import load_toml_config
 
-__all__ = [
-    'CALLER_PATH',
-]
-
 logging.Formatter.converter = gmtime
 LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
@@ -24,17 +19,15 @@ logging.basicConfig(
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 USE_INTERNAL_CONFIG: Final[bool] = False
 pkg_dir = Path(__file__).parent
-cfg_path_internal = pkg_dir / CONFIG_FILENAME
-caller_file = Path(inspect.stack()[-1].filename)
-CALLER_PATH: Final[Path] = caller_file.parent.resolve()
+cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
+# caller_file = Path(inspect.stack()[-1].filename)
+# CALLER_PATH: Final[Path] = caller_file.parent.resolve()
 
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
     loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
-    if not caller_file.exists():
-        raise FileNotFoundError('Caller file could not be correctly retrieved.')
+    cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
     if not cfg_path_external.exists():
         shutil.copy(cfg_path_internal, cfg_path_external)
         sys.exit(
@@ -3,7 +3,7 @@ import sys
 import typing
 from collections.abc import Hashable, Iterable
 from pathlib import Path
-from typing import Any, Final, Literal, Self, overload
+from typing import Any, Final, Literal, Self, cast, overload
 
 import networkx as nx
 import numpy as np
@@ -13,6 +13,12 @@ from pandas import DataFrame
 
 from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
+from lang_main.types import (
+    CytoscapeData,
+    EdgeWeight,
+    NodeTitle,
+    WeightData,
+)
 
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -67,7 +73,7 @@ def update_graph(
     batch: Iterable[tuple[Hashable, Hashable]] | None = None,
     parent: Hashable | None = None,
     child: Hashable | None = None,
-    weight_connection: int = 1,
+    weight_connection: int | None = None,
 ) -> None:
     # !! not necessary to check for existence of nodes
     # !! feature already implemented in NetworkX ``add_edge``
@@ -78,6 +84,8 @@ def update_graph(
     if child not in graph:
         graph.add_node(child)
     """
+    if weight_connection is None:
+        weight_connection = 1
     # check if edge not in Graph
     if batch is not None:
         graph.add_edges_from(batch, weight=weight_connection)
@@ -116,6 +124,51 @@ def convert_graph_to_undirected(
     return graph_undir
 
 
+def convert_graph_to_cytoscape(
+    graph: Graph | DiGraph,
+) -> tuple[list[CytoscapeData], WeightData]:
+    cyto_data: list[CytoscapeData] = []
+    # iterate over nodes
+    nodes = cast(Iterable[NodeTitle], graph.nodes)
+    for i, node in enumerate(nodes):
+        node_data: CytoscapeData = {
+            'data': {
+                'id': node,
+                'label': node,
+            }
+        }
+        cyto_data.append(node_data)
+    # iterate over edges
+    weights: set[int] = set()
+
+    edges = cast(
+        Iterable[
+            tuple[
+                NodeTitle,
+                NodeTitle,
+                EdgeWeight,
+            ]
+        ],
+        graph.edges.data('weight', default=1),  # type: ignore
+    )
+    for i, (source, target, weight) in enumerate(edges):
+        weights.add(weight)
+        edge_data: CytoscapeData = {
+            'data': {
+                'source': source,
+                'target': target,
+                'weight': weight,
+            }
+        }
+        cyto_data.append(edge_data)
+
+    min_weight = min(weights)
+    max_weight = max(weights)
+    weight_metadata: WeightData = {'min': min_weight, 'max': max_weight}
+
+    return cyto_data, weight_metadata
+
+
 class TokenGraph(DiGraph):
     def __init__(
         self,
@@ -200,7 +253,9 @@ class TokenGraph(DiGraph):
 
     @overload
     def to_undirected(
-        self, inplace: bool = ..., logging: bool | None = ...
+        self,
+        inplace: bool = ...,
+        logging: bool | None = ...,
     ) -> Graph | None: ...
 
     def to_undirected(
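For orientation, the element list produced by the new convert_graph_to_cytoscape helper follows the dash-cytoscape element format; a hypothetical two-node result (node titles invented for illustration) would look like this:

# illustrative output only, not taken from this commit
elements = [
    {'data': {'id': 'pumpe', 'label': 'pumpe'}},
    {'data': {'id': 'defekt', 'label': 'defekt'}},
    {'data': {'source': 'pumpe', 'target': 'defekt', 'weight': 3}},
]
weight_data = {'min': 3, 'max': 3}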
@@ -214,20 +214,23 @@ def analyse_feature(
     unique_feature_entries = feature_entries.unique()
 
     # prepare result DataFrame
-    cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
+    cols = ['batched_idxs', 'entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
     result_df = pd.DataFrame(columns=cols)
 
     for entry in tqdm(unique_feature_entries, mininterval=1.0):
         len_entry = len(entry)
         filt = data[target_feature] == entry
         temp = data[filt]
+        batched_idxs = temp.index.to_numpy()
         assoc_obj_ids = temp['ObjektID'].unique()
         assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
         num_assoc_obj_ids = len(assoc_obj_ids)
         num_dupl = filt.sum()
 
         conc_df = pd.DataFrame(
-            data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
+            data=[
+                [batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]
+            ],
             columns=cols,
         )
 
@@ -10,7 +10,6 @@ from networkx import Graph
 from pandas import Series
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
-from tqdm.auto import tqdm
 
 from lang_main.analysis.graphs import get_graph_metadata, update_graph
 from lang_main.types import PandasIndex
@@ -40,9 +39,8 @@ def candidates_by_index(
 
     Yields
     ------
-    Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
-        ObjectID and tuple of index pairs which meet the cosine
-        similarity threshold
+    Iterator[tuple[PandasIndex, PandasIndex]]
+        tuple of index pairs which meet the cosine similarity threshold
     """
     # embeddings
     batch = cast(list[str], data_model_input.to_list())
@@ -1,11 +1,11 @@
 import re
 from collections.abc import Iterator
 from itertools import combinations
-from typing import cast
+from typing import Literal, cast, overload
 
 from dateutil.parser import parse
 from pandas import DataFrame
-from spacy.lang.de import German as GermanSpacyModel
+from spacy.language import Language as GermanSpacyModel
 from spacy.tokens.doc import Doc as SpacyDoc
 from spacy.tokens.token import Token as SpacyToken
 from tqdm.auto import tqdm
@@ -15,6 +15,7 @@ from lang_main.analysis.graphs import (
     update_graph,
 )
 from lang_main.loggers import logger_token_analysis as logger
+from lang_main.types import PandasIndex
 
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
@@ -104,7 +105,7 @@ def obtain_relevant_descendants(
 def add_doc_info_to_graph(
     graph: TokenGraph,
     doc: SpacyDoc,
-    weight: int,
+    weight: int | None,
 ) -> None:
     # iterate over sentences
     for sent in doc.sents:
@@ -142,9 +143,121 @@ def add_doc_info_to_graph(
     )
 
 
+@overload
 def build_token_graph(
     data: DataFrame,
     model: GermanSpacyModel,
+    *,
+    target_feature: str = ...,
+    weights_feature: str | None = ...,
+    batch_idx_feature: str = ...,
+    build_map: Literal[False],
+    batch_size_model: int = ...,
+) -> tuple[TokenGraph, None]: ...
+
+
+@overload
+def build_token_graph(
+    data: DataFrame,
+    model: GermanSpacyModel,
+    *,
+    target_feature: str = ...,
+    weights_feature: str | None = ...,
+    batch_idx_feature: str = ...,
+    build_map: Literal[True] = ...,
+    batch_size_model: int = ...,
+) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: ...
+
+
+def build_token_graph(
+    data: DataFrame,
+    model: GermanSpacyModel,
+    *,
+    target_feature: str = 'entry',
+    weights_feature: str | None = None,
+    batch_idx_feature: str = 'batched_idxs',
+    build_map: bool = True,
+    batch_size_model: int = 50,
+) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
+    graph = TokenGraph()
+    model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
+    if weights_feature is not None:
+        weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
+    else:
+        weights = None
+
+    docs_mapping: dict[PandasIndex, SpacyDoc] | None
+    if build_map:
+        indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
+        docs_mapping = {}
+    else:
+        indices = None
+        docs_mapping = None
+
+    index: int = 0
+
+    for doc in tqdm(
+        model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
+    ):
+        if weights is not None:
+            weight = weights[index]
+        else:
+            weight = None
+        add_doc_info_to_graph(
+            graph=graph,
+            doc=doc,
+            weight=weight,
+        )
+        # build map if option chosen
+        if indices is not None and docs_mapping is not None:
+            corresponding_indices = indices[index]
+            for idx in corresponding_indices:
+                docs_mapping[idx] = doc
+
+        index += 1
+
+    # metadata
+    graph.update_metadata()
+    # convert to undirected
+    graph.to_undirected()
+
+    return graph, docs_mapping
+
+
+def build_token_graph_simple(
+    data: DataFrame,
+    model: GermanSpacyModel,
+) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
+    graph = TokenGraph()
+    model_input = cast(tuple[str], tuple(data['entry'].to_list()))
+    weights = cast(tuple[int], tuple(data['num_occur'].to_list()))
+    indices = cast(tuple[list[PandasIndex]], tuple(data['batched_idxs'].to_list()))
+    index: int = 0
+    docs_mapping: dict[PandasIndex, SpacyDoc] = {}
+
+    for doc in tqdm(model.pipe(model_input, batch_size=50), total=len(model_input)):
+        add_doc_info_to_graph(
+            graph=graph,
+            doc=doc,
+            weight=weights[index],
+        )
+        corresponding_indices = indices[index]
+        for idx in corresponding_indices:
+            docs_mapping[idx] = doc
+
+        index += 1
+
+    # metadata
+    graph.update_metadata()
+    # convert to undirected
+    graph.to_undirected()
+
+    return graph, docs_mapping
+
+
+def build_token_graph_old(
+    data: DataFrame,
+    model: GermanSpacyModel,
 ) -> tuple[TokenGraph]:
     # empty NetworkX directed graph
     # graph = nx.DiGraph()
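A short usage sketch of the reworked build_token_graph API (the import path and the sample rows are assumptions for illustration; column names and the spaCy model match the defaults used in this commit):

import pandas as pd
import spacy

from lang_main.analysis.token_analysis import build_token_graph  # import path assumed

nlp = spacy.load('de_dep_news_trf')
data = pd.DataFrame(
    {
        'batched_idxs': [[0, 1]],  # original row indices covered by this entry
        'entry': ['Pumpe defekt, Dichtung erneuert'],  # illustrative text
        'num_occur': [2],
    }
)
# build_map=True also returns the index -> spaCy Doc mapping;
# with build_map=False the second element of the tuple is None
graph, docs_mapping = build_token_graph(data, nlp, build_map=True)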
@@ -1,15 +1,28 @@
 from pathlib import Path
 from typing import Final
 
-from lang_main import CALLER_PATH, CONFIG
+import spacy
+from sentence_transformers import SentenceTransformer
+from spacy.language import Language as GermanSpacyModel
+
+from lang_main import CONFIG
+from lang_main.types import STFRDeviceTypes
 
 # ** paths
-input_path_conf = Path(CONFIG['paths']['inputs'])
-INPUT_PATH_FOLDER: Final[Path] = (CALLER_PATH / input_path_conf).resolve()
-save_path_conf = Path(CONFIG['paths']['results'])
-SAVE_PATH_FOLDER: Final[Path] = (CALLER_PATH / save_path_conf).resolve()
-path_dataset_conf = Path(CONFIG['paths']['dataset'])
-PATH_TO_DATASET: Final[Path] = (CALLER_PATH / path_dataset_conf).resolve()
+input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs'])
+INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve()
+# INPUT_PATH_FOLDER: Final[Path] = (CALLER_PATH / input_path_conf).resolve()
+# TODO reactivate later
+# if not INPUT_PATH_FOLDER.exists():
+#     raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
+save_path_conf = Path.cwd() / Path(CONFIG['paths']['results'])
+SAVE_PATH_FOLDER: Final[Path] = save_path_conf.resolve()
+# SAVE_PATH_FOLDER: Final[Path] = (CALLER_PATH / save_path_conf).resolve()
+path_dataset_conf = Path.cwd() / Path(CONFIG['paths']['dataset'])
+PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
+# PATH_TO_DATASET: Final[Path] = (CALLER_PATH / path_dataset_conf).resolve()
+# if not PATH_TO_DATASET.exists():
+#     raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
 # ** control
 DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
 SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
@@ -19,8 +32,18 @@ DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
 SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
 DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
 SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
-# ** export
+
+
+# ** models
+# ** sentence_transformers
+STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
+STFR_MODEL: Final[SentenceTransformer] = SentenceTransformer(
+    'sentence-transformers/all-mpnet-base-v2', device=STFR_DEVICE
+)
+
+# ** spacy
+SPCY_MODEL: Final[GermanSpacyModel] = spacy.load('de_dep_news_trf')
+
+# ** export
 # ** preprocessing
 FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
     'filename_cossim_filter_candidates'
@@ -1,6 +1,3 @@
-import spacy
-from sentence_transformers import SentenceTransformer
-
 from lang_main.analysis.preprocessing import (
     analyse_feature,
     clean_string_slim,
@@ -24,6 +21,8 @@ from lang_main.constants import (
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
     SAVE_PATH_FOLDER,
+    SPCY_MODEL,
+    STFR_MODEL,
     THRESHOLD_NUM_ACTIVITIES,
     THRESHOLD_SIMILARITY,
     THRESHOLD_TIMELINE_SIMILARITY,
@@ -49,6 +48,7 @@ pipe_target_feat.add(
         'target_feature': 'VorgangsBeschreibung',
         'cleansing_func': clean_string_slim,
     },
+    save_result=True,
 )
 pipe_target_feat.add(
     analyse_feature,
@@ -64,8 +64,7 @@ pipe_target_feat.add(
 # ?? still needed?
 # using similarity between entries to catch duplicates with typo or similar content
 # pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
-model_spacy = spacy.load('de_dep_news_trf')
-model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+
 # pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
 # pipe_embds.add(
@@ -88,7 +87,7 @@ pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
 pipe_merge.add(
     merge_similarity_dupl,
     {
-        'model': model_stfr,
+        'model': STFR_MODEL,
         'cos_sim_threshold': THRESHOLD_SIMILARITY,
     },
     save_result=True,
@@ -99,7 +98,12 @@ pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_
 pipe_token_analysis.add(
     build_token_graph,
     {
-        'model': model_spacy,
+        'model': SPCY_MODEL,
+        'target_feature': 'entry',
+        'weights_feature': 'num_occur',
+        'batch_idx_feature': 'batched_idxs',
+        'build_map': True,
+        'batch_size_model': 50,
     },
     save_result=True,
 )
@@ -135,7 +139,7 @@ pipe_timeline.add(
 pipe_timeline.add(
     get_timeline_candidates,
     {
-        'model': model_stfr,
+        'model': STFR_MODEL,
         'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
         'feature_obj_id': FEATURE_NAME_OBJ_ID,
         'model_input_feature': 'nlp_model_input',
@@ -1,11 +1,12 @@
 import enum
-from typing import TypeAlias
+from typing import Required, TypeAlias, TypedDict
 
 import numpy as np
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
 
 
+# ** logging
 class LoggingLevels(enum.IntEnum):
     DEBUG = 10
     INFO = 20
@@ -14,8 +15,50 @@ class LoggingLevels(enum.IntEnum):
     CRITICAL = 50
 
 
+# ** devices
+class STFRDeviceTypes(enum.StrEnum):
+    CPU = 'cpu'
+    GPU = 'cuda'
+
+
+# ** datatsets
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
 Embedding: TypeAlias = SpacyDoc | Tensor
+
+# ** graphs
+NodeTitle: TypeAlias = str
+EdgeWeight: TypeAlias = int
+
+
+class NodeData(TypedDict):
+    id: NodeTitle
+    label: NodeTitle
+
+
+class EdgeData(TypedDict):
+    source: NodeTitle
+    target: NodeTitle
+    weight: EdgeWeight
+
+
+class WeightData(TypedDict):
+    min: EdgeWeight
+    max: EdgeWeight
+
+
+class CytoscapePosition(TypedDict):
+    x: int
+    y: int
+
+
+class CytoscapeData(TypedDict, total=False):
+    data: Required[EdgeData | NodeData]
+    position: CytoscapePosition
+    grabbable: bool
+    locked: bool
+    classes: str
+
+
+# ** timeline
 TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
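A small illustration of how the new TypedDicts compose into Cytoscape elements (values are made up):

from lang_main.types import CytoscapeData

node: CytoscapeData = {
    'data': {'id': 'pumpe', 'label': 'pumpe'},
    'position': {'x': 75, 'y': 75},
    'grabbable': False,
}
edge: CytoscapeData = {
    'data': {'source': 'pumpe', 'target': 'defekt', 'weight': 3},
    'classes': 'followerEdge',
}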
@@ -3087,7 +3087,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,
@@ -1077,7 +1077,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,
@@ -2267,7 +2267,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,

File diff suppressed because it is too large.
test-notebooks/lang_main_config.toml (new file, 56 lines)
@@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
    'VorgangsTypName',
    'VorgangsArtText',
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
@@ -2327,7 +2327,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.8"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,
@@ -1,9 +1,9 @@
 # lang_main: Config file
 
 [paths]
-inputs = '../inputs/'
-results = './results/test_new2/'
-dataset = './01_2_Rohdaten_neu/Export4.csv'
+inputs = '../scripts/inputs/'
+results = '../scripts/results/test_new2/'
+dataset = '../data/02_202307/Export4.csv'
 #results = './results/Export7/'
 #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
 #results = './results/Export7_trunc/'