using dash-cytoscape

Florian Förster 2024-06-05 16:37:23 +02:00
parent b3cc012791
commit b3e35e7dd1
33 changed files with 12332 additions and 110 deletions

pdm.lock (generated; 15 lines changed)

@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
content_hash = "sha256:7574154c6728ede3eaf76a8b1a3b5d4339fcc8f2dc8c41042401004b6583e151"
content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98"
[[package]]
name = "annotated-types"
@ -432,6 +432,19 @@ files = [
{file = "dash_core_components-2.0.0.tar.gz", hash = "sha256:c6733874af975e552f95a1398a16c2ee7df14ce43fa60bb3718a3c6e0b63ffee"},
]
[[package]]
name = "dash-cytoscape"
version = "1.0.1"
requires_python = ">=3.8"
summary = "A Component Library for Dash aimed at facilitating network visualization in Python, wrapped around Cytoscape.js"
groups = ["trials"]
dependencies = [
"dash",
]
files = [
{file = "dash_cytoscape-1.0.1.tar.gz", hash = "sha256:1bcd1587b2d8b432945585e2295e76393d3eb829f606c198693cd2b45bea6adc"},
]
[[package]]
name = "dash-html-components"
version = "2.0.0"


@ -33,6 +33,7 @@ notebooks = [
trials = [
"plotly>=5.22.0",
"dash>=2.17.0",
"dash-cytoscape>=1.0.1",
]
[tool.ruff]
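The trials group now pins dash-cytoscape, which the new dashboard scripts below build on. For reference, a minimal dash-cytoscape app follows the pattern sketched here (component id, element values and debug settings are illustrative, not part of this commit):

import dash_cytoscape as cyto
from dash import Dash, html

# Smallest useful setup: one Cytoscape component with two nodes and one edge.
app = Dash(__name__)
app.layout = html.Div(
    [
        cyto.Cytoscape(
            id='minimal-graph',
            layout={'name': 'cose'},  # force-directed layout bundled with Cytoscape.js
            style={'width': '100%', 'height': '400px'},
            elements=[
                {'data': {'id': 'a', 'label': 'Node A'}},
                {'data': {'id': 'b', 'label': 'Node B'}},
                {'data': {'source': 'a', 'target': 'b', 'weight': 1}},
            ],
        )
    ]
)

if __name__ == '__main__':
    app.run(debug=True)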


@ -28,6 +28,8 @@ from lang_main.pipelines.predefined import (
)
from lang_main.types import (
ObjectID,
PandasIndex,
SpacyDoc,
TimelineCandidates,
)
from pandas import DataFrame, Series
@ -37,7 +39,7 @@ from pandas import DataFrame, Series
def run_preprocessing() -> DataFrame:
create_saving_folder(
saving_path_folder=SAVE_PATH_FOLDER,
overwrite_existing=True,
overwrite_existing=False,
)
# run pipelines
ret = typing.cast(
@ -56,15 +58,16 @@ def run_preprocessing() -> DataFrame:
def run_token_analysis(
preprocessed_data: DataFrame,
) -> TokenGraph:
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
# build token graph
(tk_graph,) = typing.cast(
tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
(tk_graph, docs_mapping) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc]],
pipe_token_analysis.run(starting_values=(preprocessed_data,)),
)
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
return tk_graph
return tk_graph, docs_mapping
def run_graph_postprocessing(
@ -127,9 +130,9 @@ def main() -> None:
'Preprocessing step skipped. Token analysis cannot be performed.'
)
preprocessed_data_trunc = typing.cast(
DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
) # type: ignore
tk_graph = run_token_analysis(preprocessed_data_trunc)
tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
elif not SKIP_TOKEN_ANALYSIS:
# !! hardcoded result filenames
# whole graph


@ -16,7 +16,6 @@ from dash import (
dcc,
html,
)
from lang_main import CALLER_PATH
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
@ -24,12 +23,8 @@ from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
# p_tl = Path(
# r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
# )
p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_tl = Path(r'./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@ -133,7 +128,7 @@ def update_timeline(index, obj_id):
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
@ -164,7 +159,7 @@ def update_table_candidates(index, obj_id):
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)

scripts/dashboard/cyto.py (new file, 203 lines)

@ -0,0 +1,203 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import dash_cytoscape as cyto
import lang_main.io
from dash import Dash, Input, Output, State, dcc, html
from lang_main.analysis import graphs
target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
p = Path(target).resolve()
ret = lang_main.io.load_pickle(p)
tk_graph = cast(graphs.TokenGraph, ret[0])
tk_graph_filtered = tk_graph.filter_by_edge_weight(150)
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
cyto_data, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
MIN_WEIGHT = weight_data['min']
MAX_WEIGHT = weight_data['max']
cyto.load_extra_layouts()
app = Dash(__name__)
my_stylesheet = [
# Group selectors
{
'selector': 'node',
'style': {
'shape': 'circle',
'content': 'data(label)',
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'color': 'black',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': 'edge',
'style': {
'width': 2,
'curve-style': 'bezier',
'line-color': 'grey',
'line-style': 'solid',
'line-opacity': 1,
},
},
# Class selectors
# {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
# {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]
app.layout = html.Div(
[
html.Button('Reset', id='bt-reset'),
dcc.Dropdown(
id='layout_choice_internal',
options=[
'random',
'grid',
'circle',
'concentric',
'breadthfirst',
'cose',
],
value='cose',
clearable=False,
),
dcc.Dropdown(
id='layout_choice_external',
options=[
'cose-bilkent',
'cola',
'euler',
'spread',
'dagre',
'klay',
],
clearable=False,
),
dcc.RangeSlider(
id='weight_slider',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1000,
),
cyto.Cytoscape(
id='cytoscape-graph',
layout={'name': 'cose'},
style={'width': '100%', 'height': '600px'},
stylesheet=my_stylesheet,
elements=cyto_data,
zoom=1,
),
]
)
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice_internal', 'value'),
prevent_initial_call=True,
)
def update_layout_internal(layout_choice):
return {'name': layout_choice}
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice_external', 'value'),
prevent_initial_call=True,
)
def update_layout_external(layout_choice):
return {'name': layout_choice}
@app.callback(
Output('cytoscape-graph', 'zoom'),
Output('cytoscape-graph', 'elements'),
Input('bt-reset', 'n_clicks'),
prevent_initial_call=True,
)
def reset_layout(n_clicks):
return (1, cyto_data)
# @app.callback(
# Output('cytoscape-graph', 'stylesheet'),
# Input('weight_slider', 'value'),
# State('cytoscape-graph', 'stylesheet'),
# prevent_initial_call=True,
# )
# def select_weight(range_chosen, stylesheet):
# min_weight, max_weight = range_chosen
# new_stylesheet = stylesheet.copy()
# new_stylesheet.append(
# {
# 'selector': f'[weight >= {min_weight}]',
# 'style': {'line-color': 'blue', 'line-style': 'dashed'},
# }
# )
# new_stylesheet.append(
# {
# 'selector': f'[weight <= {max_weight}]',
# 'style': {'line-color': 'blue', 'line-style': 'dashed'},
# }
# )
# return new_stylesheet
# app.layout = html.Div(
# [
# cyto.Cytoscape(
# id='cytoscape-two-nodes',
# layout={'name': 'preset'},
# style={'width': '100%', 'height': '400px'},
# stylesheet=my_stylesheet,
# elements=[
# {
# 'data': {
# 'id': 'one',
# 'label': 'Titel 1',
# },
# 'position': {'x': 75, 'y': 75},
# 'grabbable': False,
# #'locked': True,
# 'classes': 'red',
# },
# {
# 'data': {'id': 'two', 'label': 'Title 2'},
# 'position': {'x': 200, 'y': 200},
# 'classes': 'triangle',
# },
# {'data': {'source': 'one', 'target': 'two', 'weight': 2000}},
# ],
# )
# ]
# )
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
address = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(address)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()

scripts/dashboard/cyto_2.py (new file, 368 lines)

@ -0,0 +1,368 @@
import json
import os
import dash
import dash_cytoscape as cyto
from dash import Input, Output, State, callback, dcc, html
# Load extra layouts
cyto.load_extra_layouts()
# Display utility functions
def _merge(a, b):
return dict(a, **b)
def _omit(omitted_keys, d):
return {k: v for k, v in d.items() if k not in omitted_keys}
# Custom Display Components
def Card(children, **kwargs):
return html.Section(
children,
style=_merge(
{
'padding': 20,
'margin': 5,
'borderRadius': 5,
'border': 'thin lightgrey solid',
'background-color': 'white',
# Remove possibility to select the text for better UX
'user-select': 'none',
'-moz-user-select': 'none',
'-webkit-user-select': 'none',
'-ms-user-select': 'none',
},
kwargs.get('style', {}),
),
**_omit(['style'], kwargs),
)
def SectionTitle(title, size, align='center', color='#222'):
return html.Div(
style={'text-align': align, 'color': color},
children=dcc.Markdown('#' * size + ' ' + title),
)
def NamedCard(title, size, children, **kwargs):
size = min(size, 6)
size = max(size, 1)
return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])
def NamedSlider(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[
html.P(f'{name}:'),
html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
],
)
def NamedDropdown(name, **kwargs):
return html.Div(
style={'margin': '10px 0px'},
children=[
html.P(children=f'{name}:', style={'margin-left': '3px'}),
dcc.Dropdown(**kwargs),
],
)
def NamedRadioItems(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
)
def NamedInput(name, **kwargs):
return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])
# Utils
def DropdownOptionsList(*args):
return [{'label': val.capitalize(), 'value': val} for val in args]
asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')
app = dash.Dash(__name__, assets_folder=asset_path)
server = app.server
# ###################### DATA PREPROCESSING ######################
# Load data
with open('sample_network.txt', 'r', encoding='utf-8') as f:
network_data = f.read().split('\n')
# We select the first 750 edges and associated nodes for an easier visualization
edges = network_data[:750]
nodes = set()
following_node_di = {} # user id -> list of users they are following
following_edges_di = {} # user id -> list of cy edges starting from user id
followers_node_di = {} # user id -> list of followers (cy_node format)
followers_edges_di = {} # user id -> list of cy edges ending at user id
cy_edges = []
cy_nodes = []
for edge in edges:
if ' ' not in edge:
continue
source, target = edge.split(' ')
cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}
if source not in nodes:
nodes.add(source)
cy_nodes.append(cy_source)
if target not in nodes:
nodes.add(target)
cy_nodes.append(cy_target)
# Process dictionary of following
if not following_node_di.get(source):
following_node_di[source] = []
if not following_edges_di.get(source):
following_edges_di[source] = []
following_node_di[source].append(cy_target)
following_edges_di[source].append(cy_edge)
# Process dictionary of followers
if not followers_node_di.get(target):
followers_node_di[target] = []
if not followers_edges_di.get(target):
followers_edges_di[target] = []
followers_node_di[target].append(cy_source)
followers_edges_di[target].append(cy_edge)
genesis_node = cy_nodes[0]
genesis_node['classes'] = 'genesis'
default_elements = [genesis_node]
default_stylesheet = [
{'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
{
'selector': 'edge',
'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
},
{'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
{
'selector': '.followerEdge',
'style': {
'mid-target-arrow-color': 'blue',
'mid-target-arrow-shape': 'vee',
'line-color': '#0074D9',
},
},
{'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
{
'selector': '.followingEdge',
'style': {
'mid-target-arrow-color': 'red',
'mid-target-arrow-shape': 'vee',
'line-color': '#FF4136',
},
},
{
'selector': '.genesis',
'style': {
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'purple',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': '#B10DC9',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': ':selected',
'style': {
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': 'black',
'font-size': 12,
'z-index': 9999,
},
},
]
# ################################# APP LAYOUT ################################
styles = {
'json-output': {
'overflow-y': 'scroll',
'height': 'calc(50% - 25px)',
'border': 'thin lightgrey solid',
},
'tab': {'height': 'calc(98vh - 80px)'},
}
app.layout = html.Div(
[
html.Div(
className='eight columns',
children=[
cyto.Cytoscape(
id='cytoscape',
elements=default_elements,
stylesheet=default_stylesheet,
style={'height': '95vh', 'width': '100%'},
)
],
),
html.Div(
className='four columns',
children=[
dcc.Tabs(
id='tabs',
children=[
dcc.Tab(
label='Control Panel',
children=[
NamedDropdown(
name='Layout',
id='dropdown-layout',
options=DropdownOptionsList(
'random',
'grid',
'circle',
'concentric',
'breadthfirst',
'cose',
'cose-bilkent',
'dagre',
'cola',
'klay',
'spread',
'euler',
),
value='grid',
clearable=False,
),
NamedRadioItems(
name='Expand',
id='radio-expand',
options=DropdownOptionsList('followers', 'following'),
value='followers',
),
],
),
dcc.Tab(
label='JSON',
children=[
html.Div(
style=styles['tab'],
children=[
html.P('Node Object JSON:'),
html.Pre(
id='tap-node-json-output',
style=styles['json-output'],
),
html.P('Edge Object JSON:'),
html.Pre(
id='tap-edge-json-output',
style=styles['json-output'],
),
],
)
],
),
],
),
],
),
]
)
# ############################## CALLBACKS ####################################
@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
def display_tap_node(data):
return json.dumps(data, indent=2)
@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
def display_tap_edge(data):
return json.dumps(data, indent=2)
@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
def update_cytoscape_layout(layout):
return {'name': layout}
@callback(
Output('cytoscape', 'elements'),
Input('cytoscape', 'tapNodeData'),
State('cytoscape', 'elements'),
State('radio-expand', 'value'),
)
def generate_elements(nodeData, elements, expansion_mode):
if not nodeData:
return default_elements
# If the node has already been expanded, we don't expand it again
if nodeData.get('expanded'):
return elements
# This retrieves the currently selected element and tags it as expanded
for element in elements:
if nodeData['id'] == element.get('data').get('id'):
element['data']['expanded'] = True
break
if expansion_mode == 'followers':
followers_nodes = followers_node_di.get(nodeData['id'])
followers_edges = followers_edges_di.get(nodeData['id'])
if followers_nodes:
for node in followers_nodes:
node['classes'] = 'followerNode'
elements.extend(followers_nodes)
if followers_edges:
for follower_edge in followers_edges:
follower_edge['classes'] = 'followerEdge'
elements.extend(followers_edges)
elif expansion_mode == 'following':
following_nodes = following_node_di.get(nodeData['id'])
following_edges = following_edges_di.get(nodeData['id'])
if following_nodes:
for node in following_nodes:
if node['data']['id'] != genesis_node['data']['id']:
node['classes'] = 'followingNode'
elements.append(node)
if following_edges:
for follower_edge in following_edges:
follower_edge['classes'] = 'followingEdge'
elements.extend(following_edges)
return elements
if __name__ == '__main__':
app.run_server(debug=True)

(File diff suppressed because it is too large.)


@ -10,14 +10,14 @@ dataset = '../data/02_202307/Export4.csv'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
preprocessing = false
preprocessing_skip = false
token_analysis = true
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false
time_analysis = false
time_analysis_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'


@ -1,9 +1,15 @@
from pathlib import Path
from lang_main.constants import (
INPUT_PATH_FOLDER,
PATH_TO_DATASET,
SAVE_PATH_FOLDER,
input_path_conf,
)
print(SAVE_PATH_FOLDER, '\n')
print(INPUT_PATH_FOLDER, '\n')
print(PATH_TO_DATASET, '\n')
print('------------------------')
print(Path.cwd(), '\n', input_path_conf)


@ -0,0 +1,51 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
import warnings
from lang_main.io import load_toml_config
__all__ = [
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = True
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
warnings.warn('Caller file could not be correctly retrieved.')
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the programm.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()


@ -1,4 +1,3 @@
import inspect
import logging
import shutil
import sys
@ -8,10 +7,6 @@ from typing import Any, Final
from lang_main.io import load_toml_config
__all__ = [
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
@ -24,17 +19,15 @@ logging.basicConfig(
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()
cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
# caller_file = Path(inspect.stack()[-1].filename)
# CALLER_PATH: Final[Path] = caller_file.parent.resolve()
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
raise FileNotFoundError('Caller file could not be correctly retrieved.')
cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(


@ -3,7 +3,7 @@ import sys
import typing
from collections.abc import Hashable, Iterable
from pathlib import Path
from typing import Any, Final, Literal, Self, overload
from typing import Any, Final, Literal, Self, cast, overload
import networkx as nx
import numpy as np
@ -13,6 +13,12 @@ from pandas import DataFrame
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
from lang_main.types import (
CytoscapeData,
EdgeWeight,
NodeTitle,
WeightData,
)
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@ -67,7 +73,7 @@ def update_graph(
batch: Iterable[tuple[Hashable, Hashable]] | None = None,
parent: Hashable | None = None,
child: Hashable | None = None,
weight_connection: int = 1,
weight_connection: int | None = None,
) -> None:
# !! not necessary to check for existence of nodes
# !! feature already implemented in NetworkX ``add_edge``
@ -78,6 +84,8 @@ def update_graph(
if child not in graph:
graph.add_node(child)
"""
if weight_connection is None:
weight_connection = 1
# check if edge not in Graph
if batch is not None:
graph.add_edges_from(batch, weight=weight_connection)
@ -116,6 +124,51 @@ def convert_graph_to_undirected(
return graph_undir
def convert_graph_to_cytoscape(
graph: Graph | DiGraph,
) -> tuple[list[CytoscapeData], WeightData]:
cyto_data: list[CytoscapeData] = []
# iterate over nodes
nodes = cast(Iterable[NodeTitle], graph.nodes)
for i, node in enumerate(nodes):
node_data: CytoscapeData = {
'data': {
'id': node,
'label': node,
}
}
cyto_data.append(node_data)
# iterate over edges
weights: set[int] = set()
edges = cast(
Iterable[
tuple[
NodeTitle,
NodeTitle,
EdgeWeight,
]
],
graph.edges.data('weight', default=1), # type: ignore
)
for i, (source, target, weight) in enumerate(edges):
weights.add(weight)
edge_data: CytoscapeData = {
'data': {
'source': source,
'target': target,
'weight': weight,
}
}
cyto_data.append(edge_data)
min_weight = min(weights)
max_weight = max(weights)
weight_metadata: WeightData = {'min': min_weight, 'max': max_weight}
return cyto_data, weight_metadata
class TokenGraph(DiGraph):
def __init__(
self,
@ -200,7 +253,9 @@ class TokenGraph(DiGraph):
@overload
def to_undirected(
self, inplace: bool = ..., logging: bool | None = ...
self,
inplace: bool = ...,
logging: bool | None = ...,
) -> Graph | None: ...
def to_undirected(
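For orientation: convert_graph_to_cytoscape above flattens the token graph into the element list that scripts/dashboard/cyto.py passes to cyto.Cytoscape. A hedged sketch of the returned shape and its use (node titles and weights are illustrative):

# Node records and edge records share one flat list, matching the
# CytoscapeData TypedDict added in lang_main.types.
cyto_data = [
    {'data': {'id': 'Pumpe', 'label': 'Pumpe'}},
    {'data': {'id': 'defekt', 'label': 'defekt'}},
    {'data': {'source': 'Pumpe', 'target': 'defekt', 'weight': 150}},
]
weight_data = {'min': 150, 'max': 150}
# In the dashboard the element list feeds the component directly and the
# weight metadata bounds the RangeSlider:
#   cyto.Cytoscape(id='cytoscape-graph', elements=cyto_data, ...)
#   dcc.RangeSlider(min=weight_data['min'], max=weight_data['max'], ...)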


@ -214,20 +214,23 @@ def analyse_feature(
unique_feature_entries = feature_entries.unique()
# prepare result DataFrame
cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
cols = ['batched_idxs', 'entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
result_df = pd.DataFrame(columns=cols)
for entry in tqdm(unique_feature_entries, mininterval=1.0):
len_entry = len(entry)
filt = data[target_feature] == entry
temp = data[filt]
batched_idxs = temp.index.to_numpy()
assoc_obj_ids = temp['ObjektID'].unique()
assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
num_assoc_obj_ids = len(assoc_obj_ids)
num_dupl = filt.sum()
conc_df = pd.DataFrame(
data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
data=[
[batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]
],
columns=cols,
)


@ -10,7 +10,6 @@ from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm
from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex
@ -40,9 +39,8 @@ def candidates_by_index(
Yields
------
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
ObjectID and tuple of index pairs which meet the cosine
similarity threshold
Iterator[tuple[PandasIndex, PandasIndex]]
tuple of index pairs which meet the cosine similarity threshold
"""
# embeddings
batch = cast(list[str], data_model_input.to_list())


@ -1,11 +1,11 @@
import re
from collections.abc import Iterator
from itertools import combinations
from typing import cast
from typing import Literal, cast, overload
from dateutil.parser import parse
from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.language import Language as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm
@ -15,6 +15,7 @@ from lang_main.analysis.graphs import (
update_graph,
)
from lang_main.loggers import logger_token_analysis as logger
from lang_main.types import PandasIndex
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
@ -104,7 +105,7 @@ def obtain_relevant_descendants(
def add_doc_info_to_graph(
graph: TokenGraph,
doc: SpacyDoc,
weight: int,
weight: int | None,
) -> None:
# iterate over sentences
for sent in doc.sents:
@ -142,9 +143,121 @@ def add_doc_info_to_graph(
)
@overload
def build_token_graph(
data: DataFrame,
model: GermanSpacyModel,
*,
target_feature: str = ...,
weights_feature: str | None = ...,
batch_idx_feature: str = ...,
build_map: Literal[False],
batch_size_model: int = ...,
) -> tuple[TokenGraph, None]: ...
@overload
def build_token_graph(
data: DataFrame,
model: GermanSpacyModel,
*,
target_feature: str = ...,
weights_feature: str | None = ...,
batch_idx_feature: str = ...,
build_map: Literal[True] = ...,
batch_size_model: int = ...,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: ...
def build_token_graph(
data: DataFrame,
model: GermanSpacyModel,
*,
target_feature: str = 'entry',
weights_feature: str | None = None,
batch_idx_feature: str = 'batched_idxs',
build_map: bool = True,
batch_size_model: int = 50,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
graph = TokenGraph()
model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
if weights_feature is not None:
weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
else:
weights = None
docs_mapping: dict[PandasIndex, SpacyDoc] | None
if build_map:
indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
docs_mapping = {}
else:
indices = None
docs_mapping = None
index: int = 0
for doc in tqdm(
model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
):
if weights is not None:
weight = weights[index]
else:
weight = None
add_doc_info_to_graph(
graph=graph,
doc=doc,
weight=weight,
)
# build map if option chosen
if indices is not None and docs_mapping is not None:
corresponding_indices = indices[index]
for idx in corresponding_indices:
docs_mapping[idx] = doc
index += 1
# metadata
graph.update_metadata()
# convert to undirected
graph.to_undirected()
return graph, docs_mapping
def build_token_graph_simple(
data: DataFrame,
model: GermanSpacyModel,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
graph = TokenGraph()
model_input = cast(tuple[str], tuple(data['entry'].to_list()))
weights = cast(tuple[int], tuple(data['num_occur'].to_list()))
indices = cast(tuple[list[PandasIndex]], tuple(data['batched_idxs'].to_list()))
index: int = 0
docs_mapping: dict[PandasIndex, SpacyDoc] = {}
for doc in tqdm(model.pipe(model_input, batch_size=50), total=len(model_input)):
add_doc_info_to_graph(
graph=graph,
doc=doc,
weight=weights[index],
)
corresponding_indices = indices[index]
for idx in corresponding_indices:
docs_mapping[idx] = doc
index += 1
# metadata
graph.update_metadata()
# convert to undirected
graph.to_undirected()
return graph, docs_mapping
def build_token_graph_old(
data: DataFrame,
model: GermanSpacyModel,
) -> tuple[TokenGraph]:
# empty NetworkX directed graph
# graph = nx.DiGraph()
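A usage sketch for the reworked build_token_graph (the DataFrame name is a placeholder; the column names match those produced by analyse_feature and wired up in the predefined pipeline below):

# preprocessed_df is assumed to carry 'entry', 'num_occur' and 'batched_idxs'
# columns; model is the loaded German spaCy pipeline.
graph, docs_mapping = build_token_graph(
    preprocessed_df,
    model,
    target_feature='entry',
    weights_feature='num_occur',
    batch_idx_feature='batched_idxs',
    build_map=True,
    batch_size_model=50,
)
# docs_mapping maps every original row index back to its spaCy Doc.
# With build_map=False the overloads narrow the second element to None:
graph_only, _ = build_token_graph(preprocessed_df, model, build_map=False)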


@ -1,15 +1,28 @@
from pathlib import Path
from typing import Final
from lang_main import CALLER_PATH, CONFIG
import spacy
from sentence_transformers import SentenceTransformer
from spacy.language import Language as GermanSpacyModel
from lang_main import CONFIG
from lang_main.types import STFRDeviceTypes
# ** paths
input_path_conf = Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = (CALLER_PATH / input_path_conf).resolve()
save_path_conf = Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = (CALLER_PATH / save_path_conf).resolve()
path_dataset_conf = Path(CONFIG['paths']['dataset'])
PATH_TO_DATASET: Final[Path] = (CALLER_PATH / path_dataset_conf).resolve()
input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve()
# INPUT_PATH_FOLDER: Final[Path] = (CALLER_PATH / input_path_conf).resolve()
# TODO reactivate later
# if not INPUT_PATH_FOLDER.exists():
# raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
save_path_conf = Path.cwd() / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = save_path_conf.resolve()
# SAVE_PATH_FOLDER: Final[Path] = (CALLER_PATH / save_path_conf).resolve()
path_dataset_conf = Path.cwd() / Path(CONFIG['paths']['dataset'])
PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
# PATH_TO_DATASET: Final[Path] = (CALLER_PATH / path_dataset_conf).resolve()
# if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
@ -19,8 +32,18 @@ DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export
# ** models
# ** sentence_transformers
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_MODEL: Final[SentenceTransformer] = SentenceTransformer(
'sentence-transformers/all-mpnet-base-v2', device=STFR_DEVICE
)
# ** spacy
SPCY_MODEL: Final[GermanSpacyModel] = spacy.load('de_dep_news_trf')
# ** export
# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'


@ -1,6 +1,3 @@
import spacy
from sentence_transformers import SentenceTransformer
from lang_main.analysis.preprocessing import (
analyse_feature,
clean_string_slim,
@ -24,6 +21,8 @@ from lang_main.constants import (
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
SAVE_PATH_FOLDER,
SPCY_MODEL,
STFR_MODEL,
THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY,
@ -49,6 +48,7 @@ pipe_target_feat.add(
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
save_result=True,
)
pipe_target_feat.add(
analyse_feature,
@ -64,8 +64,7 @@ pipe_target_feat.add(
# ?? still needed?
# using similarity between entries to catch duplicates with typo or similar content
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
# pipe_embds.add(
@ -88,7 +87,7 @@ pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(
merge_similarity_dupl,
{
'model': model_stfr,
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
@ -99,7 +98,12 @@ pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_
pipe_token_analysis.add(
build_token_graph,
{
'model': model_spacy,
'model': SPCY_MODEL,
'target_feature': 'entry',
'weights_feature': 'num_occur',
'batch_idx_feature': 'batched_idxs',
'build_map': True,
'batch_size_model': 50,
},
save_result=True,
)
@ -135,7 +139,7 @@ pipe_timeline.add(
pipe_timeline.add(
get_timeline_candidates,
{
'model': model_stfr,
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',


@ -1,11 +1,12 @@
import enum
from typing import TypeAlias
from typing import Required, TypeAlias, TypedDict
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
# ** logging
class LoggingLevels(enum.IntEnum):
DEBUG = 10
INFO = 20
@ -14,8 +15,50 @@ class LoggingLevels(enum.IntEnum):
CRITICAL = 50
# ** devices
class STFRDeviceTypes(enum.StrEnum):
CPU = 'cpu'
GPU = 'cuda'
# ** datasets
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
# ** graphs
NodeTitle: TypeAlias = str
EdgeWeight: TypeAlias = int
class NodeData(TypedDict):
id: NodeTitle
label: NodeTitle
class EdgeData(TypedDict):
source: NodeTitle
target: NodeTitle
weight: EdgeWeight
class WeightData(TypedDict):
min: EdgeWeight
max: EdgeWeight
class CytoscapePosition(TypedDict):
x: int
y: int
class CytoscapeData(TypedDict, total=False):
data: Required[EdgeData | NodeData]
position: CytoscapePosition
grabbable: bool
locked: bool
classes: str
# ** timeline
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
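Illustrative instances of the new Cytoscape-related types (all values invented for the example):

node: CytoscapeData = {
    'data': {'id': 'Pumpe', 'label': 'Pumpe'},
    'position': {'x': 75, 'y': 75},
    'grabbable': False,
    'classes': 'red',
}
edge: CytoscapeData = {'data': {'source': 'Pumpe', 'target': 'defekt', 'weight': 3}}
weights: WeightData = {'min': 1, 'max': 150}
candidates: TimelineCandidates = {4711: ((12, 98, 105), (13, 44))}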


@ -3087,7 +3087,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,


@ -1077,7 +1077,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,


@ -2267,7 +2267,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,

(File diff suppressed because it is too large.)


@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
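A hedged sketch of how this default file is read (load_toml_config and its keyword argument are taken from src/lang_main/__init__.py above; the path is illustrative):

from pathlib import Path

from lang_main.io import load_toml_config

cfg = load_toml_config(path_to_toml=Path('lang_main_config.toml'))
# The control flags decide which pipeline stages run, e.g.:
do_preprocessing = cfg['control']['preprocessing']            # true in this default file
skip_token_analysis = cfg['control']['token_analysis_skip']   # false in this default file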


@ -2327,7 +2327,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,


@ -1,9 +1,9 @@
# lang_main: Config file
[paths]
inputs = '../inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
inputs = '../scripts/inputs/'
results = '../scripts/results/test_new2/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'