new pipeline management, prototype graph display for timeline

Florian Förster 2024-06-19 16:58:26 +02:00
parent c2714b8060
commit fb4437a3a2
21 changed files with 2838 additions and 11383 deletions

pdm.lock (generated): 8 changed lines
View File

@@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"] groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"] strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1" lock_version = "4.4.1"
content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98" content_hash = "sha256:e00f157f833ee7615d96375c352e2caa6b4f6b50e5615ccbefa79446189594c7"
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
@@ -2938,13 +2938,13 @@ files = [
[[package]] [[package]]
name = "typing-extensions" name = "typing-extensions"
version = "4.11.0" version = "4.12.2"
requires_python = ">=3.8" requires_python = ">=3.8"
summary = "Backported and Experimental Type Hints for Python 3.8+" summary = "Backported and Experimental Type Hints for Python 3.8+"
groups = ["default", "notebooks", "trials"] groups = ["default", "notebooks", "trials"]
files = [ files = [
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
] ]
[[package]] [[package]]

View File

@@ -12,6 +12,7 @@ dependencies = [
"sentence-transformers>=2.7.0", "sentence-transformers>=2.7.0",
"numpy>=1.26.4", "numpy>=1.26.4",
"pip>=24.0", "pip>=24.0",
"typing-extensions>=4.12.2",
] ]
requires-python = ">=3.11" requires-python = ">=3.11"
readme = "README.md" readme = "README.md"
@@ -48,3 +49,6 @@ skip-magic-trailing-comma = false
[tool.ruff.lint] [tool.ruff.lint]
select = ["E", "F", "I"] select = ["E", "F", "I"]
[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]
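For context on the new dependency pin and the isort entry: the reworked pipeline base module further down imports `override` from `typing_extensions`, which on Python 3.11 (the project's minimum) is not yet part of `typing`. A minimal sketch of the import pattern this enables; the version-gated fallback is illustrative and not code from the repo:

```python
# On Python 3.12+ the decorator is in the standard library; on 3.11 the
# typing-extensions backport declared in pyproject.toml provides it. The ruff
# isort option above then sorts the backport together with stdlib imports.
import sys

if sys.version_info >= (3, 12):
    from typing import override
else:
    from typing_extensions import override
```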

View File

@@ -1,42 +1,44 @@
import typing import typing
import warnings
from pathlib import Path
from typing import cast from typing import cast
from pandas import DataFrame, Series
from lang_main.analysis.graphs import TokenGraph from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import ( from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
DO_TIME_ANALYSIS,
DO_TOKEN_ANALYSIS,
INPUT_PATH_FOLDER,
PATH_TO_DATASET, PATH_TO_DATASET,
SAVE_PATH_FOLDER, SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING, SKIP_GRAPH_POSTPROCESSING,
SKIP_PREPROCESSING, SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS, SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS, SKIP_TOKEN_ANALYSIS,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
) )
from lang_main.io import create_saving_folder, load_pickle from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import PipelineContainer
from lang_main.pipelines.predefined import ( from lang_main.pipelines.predefined import (
pipe_merge, build_base_target_feature_pipe,
pipe_target_feat, build_merge_duplicates_pipe,
pipe_timeline, build_timeline_pipe,
pipe_token_analysis, build_tk_graph_pipe,
build_tk_graph_post_pipe,
) )
from lang_main.types import ( from lang_main.types import (
EntryPoints,
ObjectID, ObjectID,
PandasIndex, PandasIndex,
SpacyDoc, SpacyDoc,
TimelineCandidates, TimelineCandidates,
) )
from pandas import DataFrame, Series
# ** build pipelines
pipe_merge = build_merge_duplicates_pipe()
pipe_target_feat = build_base_target_feature_pipe()
pipe_timeline = build_timeline_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
# ** processing pipeline # ** preprocessing pipeline
def run_preprocessing() -> DataFrame: def run_preprocessing() -> None:
create_saving_folder( create_saving_folder(
saving_path_folder=SAVE_PATH_FOLDER, saving_path_folder=SAVE_PATH_FOLDER,
overwrite_existing=False, overwrite_existing=False,
@@ -46,134 +48,69 @@ def run_preprocessing() -> DataFrame:
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)) tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
) )
target_feat_data = ret[0] target_feat_data = ret[0]
# only entries with more than threshold amount of characters _ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
subset_data = target_feat_data.loc[data_filter].copy()
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data
def run_token_analysis( # ** token analysis
preprocessed_data: DataFrame, def run_token_analysis() -> None:
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: # load entry point
entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TOKEN_ANALYSIS)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
# build token graph # build token graph
(tk_graph, docs_mapping) = typing.cast( (tk_graph, docs_mapping) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc]], tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
pipe_token_analysis.run(starting_values=(preprocessed_data,)), pipe_token_analysis.run(starting_values=(preprocessed_data,)),
) )
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False) tk_graph.to_GraphML(SAVE_PATH_FOLDER, filename='TokenGraph', directed=False)
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
return tk_graph, docs_mapping
def run_graph_postprocessing( def run_graph_postprocessing() -> None:
tk_graph: TokenGraph, # load entry point
) -> TokenGraph: entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TK_GRAPH_POST)
loaded_results = cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
# filter graph by edge weight and remove single nodes (no connection) # filter graph by edge weight and remove single nodes (no connection)
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT) ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1) tk_graph_filtered = ret[0]
tk_graph_filtered.save_graph( # tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
tk_graph_filtered.to_GraphML(
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
) )
tk_graph_filtered.to_pickle(
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
)
return tk_graph_filtered
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]: # ** time analysis
filename = 'without_nan' def run_time_analysis() -> None:
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') # load entry point
verify_path(loading_path) entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
ret = load_pickle(loading_path) loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = ret[0] preprocessed_data = loaded_results[0]
ret = cast( _ = cast(
tuple[TimelineCandidates, dict[ObjectID, str]], tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)), pipe_timeline.run(starting_values=(preprocessed_data,)),
) )
return ret
def verify_path( def build_pipeline_container() -> PipelineContainer:
loading_path: Path, container = PipelineContainer(
) -> None: name='Pipeline-Container-Base', working_dir=SAVE_PATH_FOLDER
if not loading_path.exists(): )
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}') container.add(run_preprocessing, skip=SKIP_PREPROCESSING)
container.add(run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
container.add(run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
container.add(run_time_analysis, skip=SKIP_TIME_ANALYSIS)
return container
def main() -> None: def main() -> None:
pre_step_skipped: bool = False procedure = build_pipeline_container()
# ** preprocess procedure.run()
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
preprocessed_data = run_preprocessing()
elif not SKIP_PREPROCESSING:
# !! hardcoded result filenames
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
verify_path(loading_path)
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
preprocessed_data = ret[0]
else:
pre_step_skipped = True
warnings.warn('No preprocessing action selected. Skipped.')
# sys.exit(0)
# ** token analysis
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
if pre_step_skipped:
raise RuntimeError(
'Preprocessing step skipped. Token analysis cannot be performed.'
)
preprocessed_data_trunc = typing.cast(
DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
) # type: ignore
tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
elif not SKIP_TOKEN_ANALYSIS:
# !! hardcoded result filenames
# whole graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
pre_step_skipped = True
warnings.warn('No token analysis action selected. Skipped.')
# ** graph postprocessing
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
if pre_step_skipped:
raise RuntimeError(
(
'Preprocessing or token analysis step skipped. '
'Graph postprocessing cannot be performed.'
)
)
tk_graph_filtered = run_graph_postprocessing(tk_graph)
elif not SKIP_GRAPH_POSTPROCESSING:
# !! hardcoded result filenames
# filtered graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
warnings.warn('No graph postprocessing action selected. Skipped.')
# ** time analysis
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
# no check for fails, runs separately
ret = run_time_analysis()
elif not SKIP_TIME_ANALYSIS:
...
else:
warnings.warn('No time analysis action selected. Skipped.')
if __name__ == '__main__': if __name__ == '__main__':
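The rewritten script above swaps the hand-rolled DO_*/SKIP_* branching for a PipelineContainer that runs registered steps in order and honours per-step skip flags. A self-contained mock of that control flow, assuming only the `add(step, skip=...)`/`run()` usage visible in the diff; `MockPipelineContainer` is illustrative and not the real class from lang_main.pipelines.base:

```python
from collections.abc import Callable
from dataclasses import dataclass, field


@dataclass
class MockPipelineContainer:
    """Illustrative stand-in mirroring the add()/run() calls used in main.py."""

    name: str
    steps: list[tuple[Callable[[], None], bool]] = field(default_factory=list)

    def add(self, step: Callable[[], None], skip: bool = False) -> None:
        # remember the callable together with its skip flag
        self.steps.append((step, skip))

    def run(self) -> None:
        for step, skip in self.steps:
            if skip:
                print(f'[{self.name}] skipping {step.__name__}')
                continue
            step()


def run_preprocessing() -> None:
    print('preprocessing...')


def run_token_analysis() -> None:
    print('token analysis...')


container = MockPipelineContainer(name='Pipeline-Container-Base')
container.add(run_preprocessing, skip=False)
container.add(run_token_analysis, skip=True)  # mirrors a *_skip flag from config.toml
container.run()
```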

View File

@@ -1,190 +0,0 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** graph config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()

View File

@@ -1,9 +1,9 @@
import copy
import time import time
import webbrowser import webbrowser
from pathlib import Path from pathlib import Path
from threading import Thread from threading import Thread
from typing import cast from typing import cast
import copy
import dash_cytoscape as cyto import dash_cytoscape as cyto
from dash import Dash, Input, Output, State, dcc, html from dash import Dash, Input, Output, State, dcc, html
@@ -30,20 +30,20 @@ app = Dash(__name__, external_stylesheets=external_stylesheets)
cose_layout = { cose_layout = {
'name': 'cose', 'name': 'cose',
'nodeOverlap': 20, 'nodeOverlap': 500,
'refresh': 20, 'refresh': 20,
'fit': True, 'fit': True,
'padding': 30, 'padding': 20,
'randomize': True, 'randomize': False,
'componentSpacing': 40, 'componentSpacing': 1.2,
'nodeRepulsion': 2000, 'nodeRepulsion': 1000,
'edgeElasticity': 1000, 'edgeElasticity': 1000,
'idealEdgeLength': 100, 'idealEdgeLength': 100,
'nestingFactor': 1.2, 'nestingFactor': 1.2,
'gravity': 50, 'gravity': 50,
'numIter': 2000, 'numIter': 3000,
'initialTemp': 1000, 'initialTemp': 2000,
'coolingFactor': 0.95, 'coolingFactor': 0.7,
'minTemp': 1.0, 'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True, 'nodeDimensionsIncludeLabels': True,
} }
@@ -108,9 +108,8 @@ my_stylesheet = [
# {'selector': '.triangle', 'style': {'shape': 'triangle'}}, # {'selector': '.triangle', 'style': {'shape': 'triangle'}},
] ]
app.layout = html.Div( layout = html.Div(
[ [
html.Button('Trigger JS Layout', id='test_js'),
html.Button('Trigger JS Weight', id='test_js_weight'), html.Button('Trigger JS Weight', id='test_js_weight'),
html.Div(id='output'), html.Div(id='output'),
html.Div( html.Div(
@@ -166,11 +165,13 @@ app.layout = html.Div(
style={'width': '40%'}, style={'width': '40%'},
), ),
html.H3('Graph'), html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div( html.Div(
[ [
cyto.Cytoscape( cyto.Cytoscape(
id='cytoscape-graph', id='cytoscape-graph',
style={'width': '100%', 'height': '600px'}, style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet, stylesheet=my_stylesheet,
elements=cyto_data_base, elements=cyto_data_base,
zoom=1, zoom=1,
@@ -192,6 +193,9 @@ app.layout = html.Div(
) )
app.layout = layout
@app.callback( @app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True), Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'), Input('layout_choice', 'value'),
@@ -266,17 +270,17 @@ app.clientside_callback(
""" """
function(n_clicks, layout) { function(n_clicks, layout) {
layout.edgeElasticity = function(edge) { layout.edgeElasticity = function(edge) {
return edge.data().weight * 4; return edge.data().weight * 0.05;
}; };
layout.idealEdgeLength = function(edge) { layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.8; return edge.data().weight * 0.4;
}; };
cy.layout(layout).run(); cy.layout(layout).run();
return layout; return layout;
} }
""", """,
Output('cytoscape-graph', 'layout', allow_duplicate=True), Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('test_js', 'n_clicks'), Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'), State('cytoscape-graph', 'layout'),
prevent_initial_call=True, prevent_initial_call=True,
) )

View File

@@ -1,368 +0,0 @@
import json
import os
import dash
import dash_cytoscape as cyto
from dash import Input, Output, State, callback, dcc, html
# Load extra layouts
cyto.load_extra_layouts()
# Display utility functions
def _merge(a, b):
return dict(a, **b)
def _omit(omitted_keys, d):
return {k: v for k, v in d.items() if k not in omitted_keys}
# Custom Display Components
def Card(children, **kwargs):
return html.Section(
children,
style=_merge(
{
'padding': 20,
'margin': 5,
'borderRadius': 5,
'border': 'thin lightgrey solid',
'background-color': 'white',
# Remove possibility to select the text for better UX
'user-select': 'none',
'-moz-user-select': 'none',
'-webkit-user-select': 'none',
'-ms-user-select': 'none',
},
kwargs.get('style', {}),
),
**_omit(['style'], kwargs),
)
def SectionTitle(title, size, align='center', color='#222'):
return html.Div(
style={'text-align': align, 'color': color},
children=dcc.Markdown('#' * size + ' ' + title),
)
def NamedCard(title, size, children, **kwargs):
size = min(size, 6)
size = max(size, 1)
return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])
def NamedSlider(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[
html.P(f'{name}:'),
html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
],
)
def NamedDropdown(name, **kwargs):
return html.Div(
style={'margin': '10px 0px'},
children=[
html.P(children=f'{name}:', style={'margin-left': '3px'}),
dcc.Dropdown(**kwargs),
],
)
def NamedRadioItems(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
)
def NamedInput(name, **kwargs):
return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])
# Utils
def DropdownOptionsList(*args):
return [{'label': val.capitalize(), 'value': val} for val in args]
asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')
app = dash.Dash(__name__, assets_folder=asset_path)
server = app.server
# ###################### DATA PREPROCESSING ######################
# Load data
with open('sample_network.txt', 'r', encoding='utf-8') as f:
network_data = f.read().split('\n')
# We select the first 750 edges and associated nodes for an easier visualization
edges = network_data[:750]
nodes = set()
following_node_di = {} # user id -> list of users they are following
following_edges_di = {} # user id -> list of cy edges starting from user id
followers_node_di = {} # user id -> list of followers (cy_node format)
followers_edges_di = {} # user id -> list of cy edges ending at user id
cy_edges = []
cy_nodes = []
for edge in edges:
if ' ' not in edge:
continue
source, target = edge.split(' ')
cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}
if source not in nodes:
nodes.add(source)
cy_nodes.append(cy_source)
if target not in nodes:
nodes.add(target)
cy_nodes.append(cy_target)
# Process dictionary of following
if not following_node_di.get(source):
following_node_di[source] = []
if not following_edges_di.get(source):
following_edges_di[source] = []
following_node_di[source].append(cy_target)
following_edges_di[source].append(cy_edge)
# Process dictionary of followers
if not followers_node_di.get(target):
followers_node_di[target] = []
if not followers_edges_di.get(target):
followers_edges_di[target] = []
followers_node_di[target].append(cy_source)
followers_edges_di[target].append(cy_edge)
genesis_node = cy_nodes[0]
genesis_node['classes'] = 'genesis'
default_elements = [genesis_node]
default_stylesheet = [
{'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
{
'selector': 'edge',
'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
},
{'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
{
'selector': '.followerEdge',
'style': {
'mid-target-arrow-color': 'blue',
'mid-target-arrow-shape': 'vee',
'line-color': '#0074D9',
},
},
{'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
{
'selector': '.followingEdge',
'style': {
'mid-target-arrow-color': 'red',
'mid-target-arrow-shape': 'vee',
'line-color': '#FF4136',
},
},
{
'selector': '.genesis',
'style': {
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'purple',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': '#B10DC9',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': ':selected',
'style': {
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': 'black',
'font-size': 12,
'z-index': 9999,
},
},
]
# ################################# APP LAYOUT ################################
styles = {
'json-output': {
'overflow-y': 'scroll',
'height': 'calc(50% - 25px)',
'border': 'thin lightgrey solid',
},
'tab': {'height': 'calc(98vh - 80px)'},
}
app.layout = html.Div(
[
html.Div(
className='eight columns',
children=[
cyto.Cytoscape(
id='cytoscape',
elements=default_elements,
stylesheet=default_stylesheet,
style={'height': '95vh', 'width': '100%'},
)
],
),
html.Div(
className='four columns',
children=[
dcc.Tabs(
id='tabs',
children=[
dcc.Tab(
label='Control Panel',
children=[
NamedDropdown(
name='Layout',
id='dropdown-layout',
options=DropdownOptionsList(
'random',
'grid',
'circle',
'concentric',
'breadthfirst',
'cose',
'cose-bilkent',
'dagre',
'cola',
'klay',
'spread',
'euler',
),
value='grid',
clearable=False,
),
NamedRadioItems(
name='Expand',
id='radio-expand',
options=DropdownOptionsList('followers', 'following'),
value='followers',
),
],
),
dcc.Tab(
label='JSON',
children=[
html.Div(
style=styles['tab'],
children=[
html.P('Node Object JSON:'),
html.Pre(
id='tap-node-json-output',
style=styles['json-output'],
),
html.P('Edge Object JSON:'),
html.Pre(
id='tap-edge-json-output',
style=styles['json-output'],
),
],
)
],
),
],
),
],
),
]
)
# ############################## CALLBACKS ####################################
@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
def display_tap_node(data):
return json.dumps(data, indent=2)
@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
def display_tap_edge(data):
return json.dumps(data, indent=2)
@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
def update_cytoscape_layout(layout):
return {'name': layout}
@callback(
Output('cytoscape', 'elements'),
Input('cytoscape', 'tapNodeData'),
State('cytoscape', 'elements'),
State('radio-expand', 'value'),
)
def generate_elements(nodeData, elements, expansion_mode):
if not nodeData:
return default_elements
# If the node has already been expanded, we don't expand it again
if nodeData.get('expanded'):
return elements
# This retrieves the currently selected element, and tag it as expanded
for element in elements:
if nodeData['id'] == element.get('data').get('id'):
element['data']['expanded'] = True
break
if expansion_mode == 'followers':
followers_nodes = followers_node_di.get(nodeData['id'])
followers_edges = followers_edges_di.get(nodeData['id'])
if followers_nodes:
for node in followers_nodes:
node['classes'] = 'followerNode'
elements.extend(followers_nodes)
if followers_edges:
for follower_edge in followers_edges:
follower_edge['classes'] = 'followerEdge'
elements.extend(followers_edges)
elif expansion_mode == 'following':
following_nodes = following_node_di.get(nodeData['id'])
following_edges = following_edges_di.get(nodeData['id'])
if following_nodes:
for node in following_nodes:
if node['data']['id'] != genesis_node['data']['id']:
node['classes'] = 'followingNode'
elements.append(node)
if following_edges:
for follower_edge in following_edges:
follower_edge['classes'] = 'followingEdge'
elements.extend(following_edges)
return elements
if __name__ == '__main__':
app.run_server(debug=True)

File diff suppressed because it is too large.

View File

@@ -0,0 +1,507 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import dash_cytoscape as cyto
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from pandas import DataFrame
from lang_main.analysis import graphs
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from lang_main.analysis import tokens
from lang_main.constants import SPCY_MODEL
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
# p_df = Path(r'../Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
# p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** figure config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
# ** graphs
target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
p = Path(target).resolve()
ret = load_pickle(p)
tk_graph = cast(graphs.TokenGraph, ret[0])
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data_base, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
MIN_WEIGHT = weight_data['min']
MAX_WEIGHT = weight_data['max']
cyto.load_extra_layouts()
cose_layout = {
'name': 'cose',
'nodeOverlap': 500,
'refresh': 20,
'fit': True,
'padding': 20,
'randomize': False,
'componentSpacing': 1.2,
'nodeRepulsion': 1000,
'edgeElasticity': 1000,
'idealEdgeLength': 100,
'nestingFactor': 1.2,
'gravity': 50,
'numIter': 3000,
'initialTemp': 2000,
'coolingFactor': 0.7,
'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True,
}
my_stylesheet = [
# Group selectors
{
'selector': 'node',
'style': {
'shape': 'circle',
'content': 'data(label)',
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'color': 'black',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': 'edge',
'style': {
#'width': f'mapData(weight, {MIN_WEIGHT}, {MAX_WEIGHT}, 1, 10)',
# 'width': """function(ele) {
# return ele.data('weight');
# """,
'curve-style': 'bezier',
'line-color': 'grey',
'line-style': 'solid',
'line-opacity': 1,
},
},
# Class selectors
# {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
# {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]
# ** app
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = Dash(__name__, external_stylesheets=external_stylesheets)
graph_layout = html.Div(
[
html.Button('Trigger JS Weight', id='test_js_weight'),
html.Button('Trigger Candidate Graph', id='cand_graph'),
html.Div(id='output'),
html.Div(
[
html.H2('Token Graph', style={'margin': 0}),
html.Button(
'Reset Default',
id='bt-reset',
style={
'marginLeft': 'auto',
'width': '300px',
},
),
],
style={
'display': 'flex',
'marginBottom': '1em',
},
),
html.H3('Layout'),
dcc.Dropdown(
id='layout_choice',
options=[
'cose',
'cola',
'euler',
'random',
],
value='cose',
clearable=False,
),
html.Div(
[
html.H3('Graph Filter'),
dcc.Input(
id='weight_min',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Minimum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
dcc.Input(
id='weight_max',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Maximum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div(
[
cyto.Cytoscape(
id='cytoscape-graph',
style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet,
elements=cyto_data_base,
zoom=1,
),
],
style={
'border': '3px solid black',
'borderRadius': '25px',
'marginTop': '1em',
'marginBottom': '2em',
'padding': '7px',
},
),
],
style={'marginTop': '1em'},
),
],
)
app.layout = html.Div(
[
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(
[dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
),
graph_layout,
],
style={'margin': '2em'},
)
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
# obj_id = int(obj_id)
# # cands
# cands_obj_id = cands[obj_id]
# cands_choice = cands_obj_id[int(index) - 1]
# # data
# df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def pre_filter_data(
data: DataFrame,
idx: int,
obj_id: ObjectID,
) -> DataFrame:
obj_id = int(obj_id)
data = data.copy()
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(idx) - 1]
# data
data = data.loc[list(cands_choice)].sort_index() # type: ignore
return data
# ** graph
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'min', allow_duplicate=True),
Output('weight_min', 'max', allow_duplicate=True),
Output('weight_min', 'placeholder', allow_duplicate=True),
Output('weight_max', 'min', allow_duplicate=True),
Output('weight_max', 'max', allow_duplicate=True),
Output('weight_max', 'placeholder', allow_duplicate=True),
Input('cand_graph', 'n_clicks'),
State('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_graph_candidates(_, index, obj_id):
df = pre_filter_data(data, idx=index, obj_id=obj_id)
tk_graph_cands, _ = tokens.build_token_graph(
data=df,
model=SPCY_MODEL,
target_feature='VorgangsBeschreibung',
build_map=False,
)
cyto_data, weight_info = graphs.convert_graph_to_cytoscape(tk_graph_cands)
weight_min = weight_info['min']
weight_max = weight_info['max']
placeholder_min = f'Minimum edge weight: {weight_min} - {weight_max}'
placeholder_max = f'Maximum edge weight: {weight_min} - {weight_max}'
return (
cyto_data,
weight_min,
weight_max,
placeholder_min,
weight_min,
weight_max,
placeholder_max,
)
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'),
prevent_initial_call=True,
)
def update_layout_internal(layout_choice):
# return {'name': layout_choice}
return cose_layout
# return cose_bilkent_layout
# return cola_layout
@app.callback(
Output('cytoscape-graph', 'zoom'),
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'value'),
Output('weight_max', 'value'),
Input('bt-reset', 'n_clicks'),
prevent_initial_call=True,
)
def reset_layout(n_clicks):
return (1, cyto_data_base, None, None)
# update edge weight
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Input('weight_min', 'value'),
Input('weight_max', 'value'),
prevent_initial_call=True,
)
def update_edge_weight(weight_min, weight_max):
if not any([weight_min, weight_max]):
return cyto_data_base
if weight_min is None:
weight_min = MIN_WEIGHT
if weight_max is None:
weight_max = MAX_WEIGHT
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, weight_min, weight_max)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(weight_min, weight_max)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data, _ = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
return cyto_data
app.clientside_callback(
"""
function(n_clicks, layout) {
layout.edgeElasticity = function(edge) {
return edge.data().weight * 0.05;
};
layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.4;
};
cy.layout(layout).run();
return layout;
}
""",
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'),
prevent_initial_call=True,
)
app.clientside_callback(
"""
function(n_clicks, stylesheet) {
function edge_weight(ele) {
let threshold = 1000;
let weight = ele.data('weight');
if (weight > threshold) {
weight = 12;
} else {
weight = weight / threshold * 10;
weight = Math.max(1, weight);
}
return weight;
}
stylesheet[1].style.width = edge_weight;
cy.style(stylesheet).update();
return stylesheet;
}
""",
Output('cytoscape-graph', 'stylesheet'),
Input('test_js_weight', 'n_clicks'),
State('cytoscape-graph', 'stylesheet'),
prevent_initial_call=False,
)
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()

View File

@@ -1,38 +0,0 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis]
threshold_unique_texts = 5

View File

@@ -2,22 +2,20 @@
[paths] [paths]
inputs = './inputs/' inputs = './inputs/'
results = './results/test_20240529/' results = './results/test_20240619/'
dataset = '../data/02_202307/Export4.csv' dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/' #results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/' #results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = false preprocessing_skip = true
preprocessing_skip = false token_analysis_skip = true
token_analysis = true
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = true graph_postprocessing_skip = true
time_analysis = false time_analysis_skip = false
time_analysis_skip = true
#[export_filenames] #[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' #filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

View File

@@ -1,3 +1,5 @@
from __future__ import annotations
import copy import copy
import sys import sys
import typing import typing
@@ -169,6 +171,90 @@ def convert_graph_to_cytoscape(
return cyto_data, weight_metadata return cyto_data, weight_metadata
def filter_graph_by_edge_weight(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(graph.edges)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_graph_by_node_degree(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(graph.nodes)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
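The dashboard prototype further up already chains these new module-level filters back to back. A short usage sketch along the same lines; the pickle path and filenames are assumptions, while the call signatures match the definitions above:

```python
from pathlib import Path

from lang_main.analysis import graphs
from lang_main.io import load_pickle

# load a previously pickled token graph (path and filename are placeholders)
ret = load_pickle(Path('./results/test_20240619/TOKEN_ANALYSIS.pkl'))
tk_graph = ret[0]

# keep edges with weight >= 150, then drop nodes left without any connection
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
tk_graph_filtered.to_GraphML(Path('./results'), filename='TokenGraph-filtered', directed=False)
```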
# ** ---------------------------------------
class TokenGraph(DiGraph): class TokenGraph(DiGraph):
def __init__( def __init__(
self, self,
@@ -286,87 +372,6 @@ class TokenGraph(DiGraph):
graph=self._undirected, logging=logging graph=self._undirected, logging=logging
) )
def filter_by_edge_weight(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(self.edges)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_by_node_degree(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(self.nodes)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def _save_prepare( def _save_prepare(
self, self,
path: Path, path: Path,
@@ -379,14 +384,13 @@ class TokenGraph(DiGraph):
return saving_path return saving_path
def save_graph( def to_GraphML(
self, self,
path: Path, path: Path,
filename: str | None = None, filename: str | None = None,
directed: bool = False, directed: bool = False,
) -> None: ) -> None:
"""save one of the stored graphs to disk file, """save one of the stored graphs to GraphML format on disk,
currently only GraphML format is supported
Parameters Parameters
---------- ----------

View File

@@ -22,7 +22,7 @@ from lang_main.analysis.shared import (
similar_index_groups, similar_index_groups,
) )
from lang_main.loggers import logger_preprocess as logger from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex from lang_main.types import Embedding, PandasIndex
# ** RE patterns # ** RE patterns
@@ -119,10 +119,9 @@ def remove_duplicates(
).copy() ).copy()
logger.info('Removed all duplicates from dataset successfully.') logger.info('Removed all duplicates from dataset successfully.')
logger.info( logger.info(
( 'New Dataset properties: number of entries: %d, number of features %d',
f'New Dataset properties: number of entries: {len(wo_duplicates)}, ' len(wo_duplicates),
f'number of features {len(wo_duplicates.columns)}' len(wo_duplicates.columns),
)
) )
return (wo_duplicates,) return (wo_duplicates,)
@@ -176,6 +175,7 @@ def clean_string_slim(string: str) -> str:
string = pattern_special_chars.sub(' ', string) string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string) string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string) # string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string) string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end # remove whitespaces at the beginning and the end
string = string.strip() string = string.strip()
@@ -241,11 +241,84 @@ def analyse_feature(
return (result_df,) return (result_df,)
# ** pre-filter
def numeric_pre_filter_feature(
data: DataFrame,
feature: str,
bound_lower: int | None,
bound_upper: int | None,
) -> tuple[DataFrame]:
if not any([bound_lower, bound_upper]):
raise ValueError('No bounds for filtering provided')
data = data.copy()
if bound_lower is None:
bound_lower = cast(int, data[feature].min())
if bound_upper is None:
bound_upper = cast(int, data[feature].max())
filter_lower = data[feature] >= bound_lower
filter_upper = data[feature] <= bound_upper
filter = filter_lower & filter_upper
data = data.loc[filter]
return (data,)
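numeric_pre_filter_feature generalises the character-length cut that used to live inline in scripts/main.py (entries shorter than THRESHOLD_AMOUNT_CHARACTERS were dropped). A small sketch with toy data; the import path of this preprocessing module is an assumption:

```python
from pandas import DataFrame

# module path assumed; the function is defined in the preprocessing module patched here
from lang_main.analysis.preprocessing import numeric_pre_filter_feature

data = DataFrame(
    {
        'entry': ['ok', 'Pumpe defekt, Austausch veranlasst'],
        'len': [2, 34],
    }
)
# keep rows whose 'len' value is at least 5; no upper bound
(filtered,) = numeric_pre_filter_feature(data, feature='len', bound_lower=5, bound_upper=None)
print(filtered)
```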
# ** embedding based similarity # ** embedding based similarity
# following functions used to identify similar entries to have # following functions used to identify similar entries to have
# a more robust identification of duplicates negating negative side effects # a more robust identification of duplicates negating negative side effects
# of several disturbances like typos, escape characters, etc. # of several disturbances like typos, escape characters, etc.
# build mapping of embeddings for given model # build mapping of embeddings for given model
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data,)
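merge_similarity_dupl (moved up here) leans on candidates_by_index to propose near-duplicate pairs via sentence embeddings. A standalone sketch of that candidate search, following the same steps as the legacy helper removed from io.py further down; the model name, the sample strings and the 0.8 threshold (threshold_similarity in the config) are illustrative:

```python
import numpy as np
from sentence_transformers import SentenceTransformer, util

entries = ['Pumpe defekt', 'Pumpe defekt!!', 'Filter getauscht']
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # model choice is an assumption

embeddings = model.encode(entries, convert_to_tensor=True, show_progress_bar=False)
cos_sim = util.cos_sim(embeddings, embeddings).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)  # ignore self-similarity
cos_sim = np.triu(cos_sim)      # count every pair only once
for i, j in np.argwhere(cos_sim >= 0.8):
    print(f'candidate pair: {entries[i]!r} <-> {entries[j]!r}')
```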
#####################################################################
def build_embedding_map( def build_embedding_map(
data: Series, data: Series,
model: GermanSpacyModel | SentenceTransformer, model: GermanSpacyModel | SentenceTransformer,
@ -373,7 +446,7 @@ def list_cosSim_dupl_candidates(
save_candidates: bool = False, save_candidates: bool = False,
saving_path: Path | None = None, saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates', filename: str = 'CosSim-FilterCandidates',
pipeline: BasePipeline | None = None, pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than """providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes given threshold; more suitable for debugging purposes
@@ -465,53 +538,6 @@ def similar_ids_groups(
""" """
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data.copy(),)
# merge duplicates # merge duplicates
def merge_similarity_dupl_old( def merge_similarity_dupl_old(
data: DataFrame, data: DataFrame,

View File

@@ -24,13 +24,13 @@ PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
# if not PATH_TO_DATASET.exists(): # if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.') # raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control # ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] # DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip'] SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] # DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip'] SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] # DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip'] SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis'] # DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip'] SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models # ** models
@@ -66,11 +66,11 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
] ]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id'] FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input # ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple( MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features'] CONFIG['time_analysis']['model_input']['input_features']
) )
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature'] ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple( ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types'] CONFIG['time_analysis']['model_input']['activity_types']
) )
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][ THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
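With the DO_* switches gone, only the *_skip flags from the trimmed [control] table drive execution. A hedged sketch of how such flags can be read; the inline TOML mirrors the new config.toml above, and tomllib is already what lang_main.io uses for the real config file:

```python
import tomllib

config_text = """
[control]
preprocessing_skip = true
token_analysis_skip = true
graph_postprocessing_skip = true
time_analysis_skip = false
"""

config = tomllib.loads(config_text)
SKIP_PREPROCESSING: bool = config['control']['preprocessing_skip']
SKIP_TIME_ANALYSIS: bool = config['control']['time_analysis_skip']
print(SKIP_PREPROCESSING, SKIP_TIME_ANALYSIS)  # True False
```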

View File

@@ -1,4 +1,3 @@
import os
import pickle import pickle
import shutil import shutil
import tomllib import tomllib
@@ -22,7 +21,7 @@ def create_saving_folder(
if overwrite_existing: if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it) # overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder) shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder) saving_path_folder.mkdir(parents=True)
else: else:
logger.info( logger.info(
( (
@@ -62,56 +61,14 @@
return obj return obj
# TODO: remove, too specialised for common application
"""
def filter_candidates_idx(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    common function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed of data as Series to retain information about indices of entries
    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float
        threshold for cosine similarity to filter candidates
    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        index pairs which meet the cosine similarity threshold
    # embeddings
    batch = typing.cast(list[str],
        data_model_input.to_list())
    embds = typing.cast(Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ))
    # cosine similarity
    cos_sim = typing.cast(
        npt.NDArray,
        sentence_transformers.util.cos_sim(embds, embds).numpy()
    )
    np.fill_diagonal(cos_sim, 0.)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
    for idx_array in cos_sim_idx:
        idx_pair = typing.cast(
            tuple[np.int64, np.int64],
            tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair
"""
def get_entry_point(
    saving_path: Path,
    filename: str,
) -> Path:
    entry_point_path = (saving_path / filename).with_suffix('.pkl')
    if not entry_point_path.exists():
        raise FileNotFoundError(
            f'Could not find provided entry data under path: >>{entry_point_path}<<'
        )
    return entry_point_path

View File

@ -9,14 +9,12 @@ dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7_trunc/' #results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = true
preprocessing_skip = false preprocessing_skip = false
token_analysis = false
token_analysis_skip = false token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false time_analysis_skip = false
#[export_filenames] #[export_filenames]
@ -42,9 +40,12 @@ criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID' feature_name_obj_id = 'ObjektID'
[time_analysis.model_input] [time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [ input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung', 'VorgangsBeschreibung',
] ]
activity_feature = 'VorgangsTypName' activity_feature = 'VorgangsTypName'
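The [control] section now carries only the *_skip switches. A short sketch of reading them with the standard library (the config file name is illustrative, the keys are the ones shown above):

import tomllib

with open('lang_main_config.toml', 'rb') as f:  # illustrative file name
    config = tomllib.load(f)

skip_preprocessing: bool = config['control']['preprocessing_skip']
input_features = tuple(config['time_analysis']['model_input']['input_features'])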

View File

@ -1,9 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Callable from collections.abc import Callable
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Never, cast
from typing_extensions import override
from lang_main.loggers import logger_pipelines as logger
from lang_main.io import load_pickle, save_pickle from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_pipelines as logger
from lang_main.types import ResultHandling
# ** pipelines to perform given actions on dataset in a customisable manner # ** pipelines to perform given actions on dataset in a customisable manner
@ -12,7 +17,18 @@ class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline""" """Error describing that no action is available in the current pipeline"""
-class BasePipeline:
+class WrongActionTypeError(Exception):
+    """Error raised if added action type is not supported by corresponding pipeline"""
+
+
+class OutputInPipelineContainerError(Exception):
+    """Error raised if an output was detected by one of the performed
+    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
+    procedure which does not have any parameters or return values and should therefore not
+    return any values."""
+
+
+class BasePipeline(ABC):
def __init__( def __init__(
self, self,
name: str, name: str,
@ -25,18 +41,12 @@ class BasePipeline:
self.name = name self.name = name
# working directory for pipeline == output path # working directory for pipeline == output path
self.working_dir = working_dir self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass # container for actions to perform during pass
self.actions: list[Callable] = [] self.actions: list[Callable] = []
self.action_names: list[str] = [] self.action_names: list[str] = []
self.actions_kwargs: list[dict[str, Any]] = []
self.is_save_result: list[bool] = []
# progress tracking, start at 1 # progress tracking, start at 1
self.curr_proc_idx: int = 1 self.curr_proc_idx: int = 1
# intermediate result
self._intermediate_result: Any | None = None
def __repr__(self) -> str: def __repr__(self) -> str:
return ( return (
@ -44,15 +54,132 @@ class BasePipeline:
f'working dir: {self.working_dir}, contents: {self.action_names})' f'working dir: {self.working_dir}, contents: {self.action_names})'
) )
-    @property
-    def intermediate_result(self) -> Any:
-        return self._intermediate_result
+    def panic_wrong_action_type(
+        self,
+        action: Any,
+        compatible_type: str,
+    ) -> Never:
+        raise WrongActionTypeError(
+            (
+                f'Action must be of type {compatible_type}, '
+                f'but is of type >>{type(action)}<<.'
+            )
+        )
def prep_run(self) -> None:
logger.info('Starting pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
'The pipeline does not contain any performable actions.'
)
def post_run(self) -> None:
logger.info(
'Processing pipeline >>%s<< successfully ended after %d steps.',
self.name,
(self.curr_proc_idx - 1),
)
@abstractmethod
def add(self) -> None: ...
@abstractmethod
def logic(self) -> None: ...
def run(self, *args, **kwargs) -> Any:
self.prep_run()
ret = self.logic(*args, **kwargs)
self.post_run()
return ret
class PipelineContainer(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
super().__init__(name=name, working_dir=working_dir)
self.action_skip: list[bool] = []
@override
def add(
self,
action: Callable,
skip: bool = False,
) -> None:
if isinstance(action, Callable):
self.actions.append(action)
self.action_names.append(action.__name__)
self.action_skip.append(skip)
else:
self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
@override
def logic(self) -> None:
for idx, (action, action_name) in enumerate(zip(self.actions, self.action_names)):
# loading
if self.action_skip[idx]:
logger.info('[No Calculation] Skipping >>%s<<...', action_name)
self.curr_proc_idx += 1
continue
# calculation
ret = action()
if ret is not None:
raise OutputInPipelineContainerError(
(
f'Output in PipelineContainers not allowed. Action {action_name} '
f'returned values in Container {self.name}.'
)
)
# processing tracking
self.curr_proc_idx += 1
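A minimal sketch of how a PipelineContainer might be assembled from parameterless stage procedures; the stage functions and the working directory are hypothetical, only add(skip=...) and run() come from the class above:

from pathlib import Path

from lang_main.pipelines.base import PipelineContainer


def run_preprocessing() -> None:  # hypothetical stage procedure, must not return a value
    ...


def run_token_analysis() -> None:  # hypothetical stage procedure
    ...


container = PipelineContainer(name='Main', working_dir=Path('./results'))
container.add(run_preprocessing)
container.add(run_token_analysis, skip=True)  # skipped stages are only logged, not executed
container.run()  # raises OutputInPipelineContainerError if any stage returns a value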
class Pipeline(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
# init base class
super().__init__(name=name, working_dir=working_dir)
# name of pipeline
self.name = name
# working directory for pipeline == output path
self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass
self.actions_kwargs: list[dict[str, Any]] = []
self.save_results: ResultHandling = []
self.load_results: ResultHandling = []
# intermediate result
self._intermediate_result: tuple[Any, ...] | None = None
def __repr__(self) -> str:
return (
f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
# @property
# def intermediate_result(self) -> tuple[Any, ...] | None:
# return self._intermediate_result
@override
def add( def add(
self, self,
action: Callable, action: Callable,
action_kwargs: dict[str, Any] = {}, action_kwargs: dict[str, Any] = {},
save_result: bool = False, save_result: bool = False,
load_result: bool = False,
filename: str | None = None,
) -> None: ) -> None:
# check explicitly for function type # check explicitly for function type
# if isinstance(action, FunctionType): # if isinstance(action, FunctionType):
@ -60,11 +187,10 @@ class BasePipeline:
self.actions.append(action) self.actions.append(action)
self.action_names.append(action.__name__) self.action_names.append(action.__name__)
self.actions_kwargs.append(action_kwargs.copy()) self.actions_kwargs.append(action_kwargs.copy())
-            self.is_save_result.append(save_result)
+            self.save_results.append((save_result, filename))
+            self.load_results.append((load_result, filename))
        else:
-            raise TypeError(
-                f'Action must be custom function, but is of type >>{type(action)}<<.'
-            )
+            self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
# TODO: add multiple entries by utilising simple add method # TODO: add multiple entries by utilising simple add method
""" """
@ -88,57 +214,84 @@ class BasePipeline:
f"but is of type >>{type(action)}<<.")) f"but is of type >>{type(action)}<<."))
""" """
-    def save_curr_result(
-        self,
-        filename: str,
-    ) -> None:
-        target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
-        target_path = self.working_dir.joinpath(target_filename)
-        target_path = target_path.with_suffix('.pkl')
-        # saving file locally
-        save_pickle(obj=self._intermediate_result, path=target_path)
-
-    def load_intermediate_result(
-        self,
-        saving_path: str,
-        filename: str,
-    ) -> tuple[Any, ...]:
-        target_path = Path(saving_path + filename).with_suffix('.pkl')
-        # loading DataFrame or Series from pickle
-        data = load_pickle(target_path)
-        return data
-
-    def prep_run(self) -> None:
-        logger.info('Starting processing pipeline >>%s<<...', self.name)
-        # progress tracking
-        self.curr_proc_idx = 1
-        # check if performable actions available
-        if len(self.actions) == 0:
-            raise NoPerformableActionError(
-                'The pipeline does not contain any performable actions.'
-            )
-
-    def run(
-        self,
-        starting_values: tuple[Any, ...],
-    ) -> tuple[Any, ...]:
-        # prepare start
-        self.prep_run()
-        for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
-            if idx == 0:
-                ret = action(*starting_values, **action_kwargs)
-            else:
-                ret = action(*ret, **action_kwargs)
-            # save intermediate result
-            self._intermediate_result = ret
-            # check if result should be saved locally
-            if self.is_save_result[idx]:
-                self.save_curr_result(filename=self.action_names[idx])
-            # processing tracking
-            self.curr_proc_idx += 1
-        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
-        return ret
+    def get_result_path(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Path, str]:
+        action_name = self.action_names[action_idx]
+        if filename is None:
+            target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_{action_name}'
+        else:
+            target_filename = filename
+        target_path = self.working_dir.joinpath(target_filename).with_suffix('.pkl')
+        return target_path, action_name
+
+    def load_step(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Any, ...]:
+        target_path, action_name = self.get_result_path(action_idx, filename)
+        if not target_path.exists():
+            raise FileNotFoundError(
+                (
+                    f'No intermediate results for action >>{action_name}<< '
+                    f'under >>{target_path}<< found'
+                )
+            )
+        # results should be tuple, but that is not guaranteed
+        result_loaded = cast(tuple[Any, ...], load_pickle(target_path))
+        if not isinstance(result_loaded, tuple):
+            raise TypeError(f'Loaded results must be tuple, not {type(result_loaded)}')
+        return result_loaded
+
+    def save_step(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> None:
+        # target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
+        # target_path = self.working_dir.joinpath(target_filename)
+        # target_path = target_path.with_suffix('.pkl')
+        target_path, _ = self.get_result_path(action_idx, filename)
+        # saving file locally
+        save_pickle(obj=self._intermediate_result, path=target_path)
+
+    @override
+    def logic(
+        self,
+        starting_values: tuple[Any, ...],
+    ) -> tuple[Any, ...]:
+        for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
+            # loading
+            if self.load_results[idx][0]:
+                filename = self.load_results[idx][1]
+                ret = self.load_step(action_idx=idx, filename=filename)
+                logger.info(
+                    '[No Calculation] Loaded result for action >>%s<< successfully',
+                    self.action_names[idx],
+                )
+                self.curr_proc_idx += 1
+                continue
+            # calculation
+            if idx == 0:
+                ret = action(*starting_values, **action_kwargs)
+            else:
+                ret = action(*ret, **action_kwargs)
+            if not isinstance(ret, tuple):
+                ret = (ret,)
+            ret = cast(tuple[Any, ...], ret)
+            # save intermediate result
+            self._intermediate_result = ret
+            # saving result locally, always save last action
+            if self.save_results[idx][0] or idx == (len(self.actions) - 1):
+                filename = self.save_results[idx][1]
+                self.save_step(action_idx=idx, filename=filename)
+            # processing tracking
+            self.curr_proc_idx += 1
+        return ret
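A minimal sketch of the new per-step save/load handling from the caller's side; the actions and the file name are hypothetical:

from pathlib import Path

from lang_main.pipelines.base import Pipeline


def load_table(path: Path):  # hypothetical first action
    ...


def clean_table(table):  # hypothetical follow-up action, receives the previous result
    ...


pipe = Pipeline(name='Demo', working_dir=Path('./results'))
pipe.add(load_table)
pipe.add(clean_table, save_result=True, filename='CLEANED')  # stored as CLEANED.pkl in working_dir
# a later run could replay the stored result instead of recomputing:
# pipe.add(clean_table, load_result=True, filename='CLEANED')
results = pipe.run((Path('./data.csv'),))  # starting values feed the first action; a tuple is returned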

View File

@ -1,9 +1,11 @@
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import ( from lang_main.analysis.preprocessing import (
analyse_feature, analyse_feature,
clean_string_slim, clean_string_slim,
entry_wise_cleansing, entry_wise_cleansing,
load_raw_data, load_raw_data,
merge_similarity_dupl, merge_similarity_dupl,
numeric_pre_filter_feature,
remove_duplicates, remove_duplicates,
remove_NA, remove_NA,
) )
@ -23,40 +25,50 @@ from lang_main.constants import (
SAVE_PATH_FOLDER, SAVE_PATH_FOLDER,
SPCY_MODEL, SPCY_MODEL,
STFR_MODEL, STFR_MODEL,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
THRESHOLD_NUM_ACTIVITIES, THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY, THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY, THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS, THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE, UNIQUE_CRITERION_FEATURE,
) )
from lang_main.pipelines.base import BasePipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import EntryPoints
# ** pipeline configuration # ** pipeline configuration
# ** target feature preparation # ** target feature preparation
-pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
-pipe_target_feat.add(
-    load_raw_data,
-    {
-        'date_cols': DATE_COLS,
-    },
-)
-pipe_target_feat.add(remove_duplicates)
-pipe_target_feat.add(remove_NA, save_result=True)
-pipe_target_feat.add(
-    entry_wise_cleansing,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-        'cleansing_func': clean_string_slim,
-    },
-    save_result=True,
-)
-pipe_target_feat.add(
-    analyse_feature,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-    },
-    save_result=True,
-)
+def build_base_target_feature_pipe() -> Pipeline:
+    pipe_target_feat = Pipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
+    pipe_target_feat.add(
+        load_raw_data,
+        {
+            'date_cols': DATE_COLS,
+        },
+    )
+    pipe_target_feat.add(remove_duplicates)
+    pipe_target_feat.add(remove_NA, save_result=True)
+    pipe_target_feat.add(
+        entry_wise_cleansing,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+            'cleansing_func': clean_string_slim,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE,
+    )
+    pipe_target_feat.add(
+        analyse_feature,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+        },
+        save_result=True,
+    )
+    return pipe_target_feat
# output: DataFrame containing target feature with # output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs # number of occurrences and associated ObjectIDs
@ -81,68 +93,114 @@ pipe_target_feat.add(
# save_result=True, # save_result=True,
# ) # )
# ** Merge duplicates # ** Merge duplicates
-pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
-# pipe_merge.add(merge_similarity_dupl, save_result=True)
-pipe_merge.add(
-    merge_similarity_dupl,
-    {
-        'model': STFR_MODEL,
-        'cos_sim_threshold': THRESHOLD_SIMILARITY,
-    },
-    save_result=True,
-)
+def build_merge_duplicates_pipe() -> Pipeline:
+    pipe_merge = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
+    # pipe_merge.add(merge_similarity_dupl, save_result=True)
+    pipe_merge.add(
+        numeric_pre_filter_feature,
+        {
+            'feature': 'len',
+            'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
+            'bound_upper': None,
+        },
+    )
+    pipe_merge.add(
+        merge_similarity_dupl,
+        {
+            'model': STFR_MODEL,
+            'cos_sim_threshold': THRESHOLD_SIMILARITY,
+        },
+        save_result=True,
+        filename=EntryPoints.TOKEN_ANALYSIS,
+    )
+    return pipe_merge
# ** token analysis # ** token analysis
-pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_token_analysis.add(
-    build_token_graph,
-    {
-        'model': SPCY_MODEL,
-        'target_feature': 'entry',
-        'weights_feature': 'num_occur',
-        'batch_idx_feature': 'batched_idxs',
-        'build_map': True,
-        'batch_size_model': 50,
-    },
-    save_result=True,
-)
+def build_tk_graph_pipe() -> Pipeline:
+    pipe_token_analysis = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_token_analysis.add(
+        build_token_graph,
+        {
+            'model': SPCY_MODEL,
+            'target_feature': 'entry',
+            'weights_feature': 'num_occur',
+            'batch_idx_feature': 'batched_idxs',
+            'build_map': False,
+            'batch_size_model': 50,
+        },
+        save_result=True,
+        filename=EntryPoints.TK_GRAPH_POST,
+    )
+    return pipe_token_analysis
def build_tk_graph_post_pipe() -> Pipeline:
pipe_graph_postprocessing = Pipeline(
name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_edge_weight,
{
'bound_lower': THRESHOLD_EDGE_WEIGHT,
'bound_upper': None,
},
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_node_degree,
{
'bound_lower': 1,
'bound_upper': None,
},
save_result=True,
filename=EntryPoints.TK_GRAPH_ANALYSIS,
)
return pipe_graph_postprocessing
# ** timeline analysis # ** timeline analysis
-pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_timeline.add(
-    remove_non_relevant_obj_ids,
-    {
-        'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
-        'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-    },
-    save_result=True,
-)
-pipe_timeline.add(
-    generate_model_input,
-    {
-        'target_feature_name': 'nlp_model_input',
-        'model_input_features': MODEL_INPUT_FEATURES,
-    },
-)
-pipe_timeline.add(
-    filter_activities_per_obj_id,
-    {
-        'activity_feature': ACTIVITY_FEATURE,
-        'relevant_activity_types': ACTIVITY_TYPES,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-        'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
-    },
-)
-pipe_timeline.add(
-    get_timeline_candidates,
-    {
-        'model': STFR_MODEL,
-        'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-        'model_input_feature': 'nlp_model_input',
-    },
-    save_result=True,
-)
+def build_timeline_pipe() -> Pipeline:
+    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        remove_non_relevant_obj_ids,
+        {
+            'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
+            'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+        },
+        save_result=True,
+    )
+    pipe_timeline.add(
+        generate_model_input,
+        {
+            'target_feature_name': 'nlp_model_input',
+            'model_input_features': MODEL_INPUT_FEATURES,
+        },
+    )
+    pipe_timeline.add(
+        filter_activities_per_obj_id,
+        {
+            'activity_feature': ACTIVITY_FEATURE,
+            'relevant_activity_types': ACTIVITY_TYPES,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
+        },
+    )
+    pipe_timeline.add(
+        get_timeline_candidates,
+        {
+            'model': STFR_MODEL,
+            'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'model_input_feature': 'nlp_model_input',
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
+    return pipe_timeline
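The predefined pipelines are now built on demand instead of being instantiated at import time. A short sketch of the intended call pattern, assuming the raw dataset path is the starting value expected by load_raw_data:

from lang_main.constants import PATH_TO_DATASET
from lang_main.pipelines.predefined import build_base_target_feature_pipe

pipe_target_feat = build_base_target_feature_pipe()
results = pipe_target_feat.run((PATH_TO_DATASET,))  # tuple of starting values for the first action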

View File

@ -17,8 +17,20 @@ class LoggingLevels(enum.IntEnum):
# ** devices # ** devices
class STFRDeviceTypes(enum.StrEnum): class STFRDeviceTypes(enum.StrEnum):
CPU = 'cpu' CPU = enum.auto()
GPU = 'cuda' GPU = enum.auto()
# ** pipelines
ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
TIMELINE = 'TIMELINE'
TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
TOKEN_ANALYSIS = 'TOKEN_ANALYSIS'
# ** datasets # ** datasets
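One caveat on the device enum above: enum.auto() in a StrEnum yields the lowercase member name, so STFRDeviceTypes.GPU now evaluates to 'gpu' instead of the previous 'cuda'; if the value is passed directly to torch or sentence-transformers as a device string, the explicit 'cuda' may still be required. A two-line check with a stand-in enum:

import enum


class Device(enum.StrEnum):  # stand-in for STFRDeviceTypes, illustration only
    CPU = enum.auto()
    GPU = enum.auto()


assert Device.CPU == 'cpu' and Device.GPU == 'gpu'  # not 'cuda'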

1687
test-notebooks/misc.ipynb Normal file

File diff suppressed because it is too large