new pipeline management, prototype graph display for timeline

Florian Förster 2024-06-19 16:58:26 +02:00
parent c2714b8060
commit fb4437a3a2
21 changed files with 2838 additions and 11383 deletions

pdm.lock (generated): 8 changed lines
View File

@@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"] groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"] strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1" lock_version = "4.4.1"
content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98" content_hash = "sha256:e00f157f833ee7615d96375c352e2caa6b4f6b50e5615ccbefa79446189594c7"
[[package]] [[package]]
name = "annotated-types" name = "annotated-types"
@@ -2938,13 +2938,13 @@ files = [
[[package]] [[package]]
name = "typing-extensions" name = "typing-extensions"
version = "4.11.0" version = "4.12.2"
requires_python = ">=3.8" requires_python = ">=3.8"
summary = "Backported and Experimental Type Hints for Python 3.8+" summary = "Backported and Experimental Type Hints for Python 3.8+"
groups = ["default", "notebooks", "trials"] groups = ["default", "notebooks", "trials"]
files = [ files = [
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
] ]
[[package]] [[package]]

View File

@@ -12,6 +12,7 @@ dependencies = [
"sentence-transformers>=2.7.0", "sentence-transformers>=2.7.0",
"numpy>=1.26.4", "numpy>=1.26.4",
"pip>=24.0", "pip>=24.0",
"typing-extensions>=4.12.2",
] ]
requires-python = ">=3.11" requires-python = ">=3.11"
readme = "README.md" readme = "README.md"
@@ -48,3 +49,6 @@ skip-magic-trailing-comma = false
[tool.ruff.lint] [tool.ruff.lint]
select = ["E", "F", "I"] select = ["E", "F", "I"]
[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]
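For context on the new dependency pin and the isort entry: the reworked pipeline base module further down imports `override` from `typing_extensions`, which on Python 3.11 (the project's minimum) is not yet part of `typing`. A minimal sketch of the import pattern this enables; the version-gated fallback is illustrative and not code from the repo:

```python
# On Python 3.12+ the decorator is in the standard library; on 3.11 the
# typing-extensions backport declared in pyproject.toml provides it. The ruff
# isort option above then sorts the backport together with stdlib imports.
import sys

if sys.version_info >= (3, 12):
    from typing import override
else:
    from typing_extensions import override
```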

View File

@@ -1,42 +1,44 @@
import typing import typing
import warnings
from pathlib import Path
from typing import cast from typing import cast
from pandas import DataFrame, Series
from lang_main.analysis.graphs import TokenGraph from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import ( from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
DO_TIME_ANALYSIS,
DO_TOKEN_ANALYSIS,
INPUT_PATH_FOLDER,
PATH_TO_DATASET, PATH_TO_DATASET,
SAVE_PATH_FOLDER, SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING, SKIP_GRAPH_POSTPROCESSING,
SKIP_PREPROCESSING, SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS, SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS, SKIP_TOKEN_ANALYSIS,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
) )
from lang_main.io import create_saving_folder, load_pickle from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import PipelineContainer
from lang_main.pipelines.predefined import ( from lang_main.pipelines.predefined import (
pipe_merge, build_base_target_feature_pipe,
pipe_target_feat, build_merge_duplicates_pipe,
pipe_timeline, build_timeline_pipe,
pipe_token_analysis, build_tk_graph_pipe,
build_tk_graph_post_pipe,
) )
from lang_main.types import ( from lang_main.types import (
EntryPoints,
ObjectID, ObjectID,
PandasIndex, PandasIndex,
SpacyDoc, SpacyDoc,
TimelineCandidates, TimelineCandidates,
) )
from pandas import DataFrame, Series
# ** build pipelines
pipe_merge = build_merge_duplicates_pipe()
pipe_target_feat = build_base_target_feature_pipe()
pipe_timeline = build_timeline_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
# ** processing pipeline # ** preprocessing pipeline
def run_preprocessing() -> DataFrame: def run_preprocessing() -> None:
create_saving_folder( create_saving_folder(
saving_path_folder=SAVE_PATH_FOLDER, saving_path_folder=SAVE_PATH_FOLDER,
overwrite_existing=False, overwrite_existing=False,
@@ -46,134 +48,69 @@ def run_preprocessing() -> DataFrame:
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)) tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
) )
target_feat_data = ret[0] target_feat_data = ret[0]
# only entries with more than threshold amount of characters _ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
subset_data = target_feat_data.loc[data_filter].copy()
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data
def run_token_analysis( # ** token analysis
preprocessed_data: DataFrame, def run_token_analysis() -> None:
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: # load entry point
entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TOKEN_ANALYSIS)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
# build token graph # build token graph
(tk_graph, docs_mapping) = typing.cast( (tk_graph, docs_mapping) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc]], tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
pipe_token_analysis.run(starting_values=(preprocessed_data,)), pipe_token_analysis.run(starting_values=(preprocessed_data,)),
) )
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False) tk_graph.to_GraphML(SAVE_PATH_FOLDER, filename='TokenGraph', directed=False)
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
return tk_graph, docs_mapping
def run_graph_postprocessing( def run_graph_postprocessing() -> None:
tk_graph: TokenGraph, # load entry point
) -> TokenGraph: entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TK_GRAPH_POST)
loaded_results = cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
# filter graph by edge weight and remove single nodes (no connection) # filter graph by edge weight and remove single nodes (no connection)
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT) ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1) tk_graph_filtered = ret[0]
tk_graph_filtered.save_graph( # tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
tk_graph_filtered.to_GraphML(
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
) )
tk_graph_filtered.to_pickle(
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
)
return tk_graph_filtered
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]: # ** time analysis
filename = 'without_nan' def run_time_analysis() -> None:
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') # load entry point
verify_path(loading_path) entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
ret = load_pickle(loading_path) loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = ret[0] preprocessed_data = loaded_results[0]
ret = cast( _ = cast(
tuple[TimelineCandidates, dict[ObjectID, str]], tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)), pipe_timeline.run(starting_values=(preprocessed_data,)),
) )
return ret
def verify_path( def build_pipeline_container() -> PipelineContainer:
loading_path: Path, container = PipelineContainer(
) -> None: name='Pipeline-Container-Base', working_dir=SAVE_PATH_FOLDER
if not loading_path.exists(): )
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}') container.add(run_preprocessing, skip=SKIP_PREPROCESSING)
container.add(run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
container.add(run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
container.add(run_time_analysis, skip=SKIP_TIME_ANALYSIS)
return container
def main() -> None: def main() -> None:
pre_step_skipped: bool = False procedure = build_pipeline_container()
# ** preprocess procedure.run()
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
preprocessed_data = run_preprocessing()
elif not SKIP_PREPROCESSING:
# !! hardcoded result filenames
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
verify_path(loading_path)
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
preprocessed_data = ret[0]
else:
pre_step_skipped = True
warnings.warn('No preprocessing action selected. Skipped.')
# sys.exit(0)
# ** token analysis
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
if pre_step_skipped:
raise RuntimeError(
'Preprocessing step skipped. Token analysis cannot be performed.'
)
preprocessed_data_trunc = typing.cast(
DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
) # type: ignore
tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
elif not SKIP_TOKEN_ANALYSIS:
# !! hardcoded result filenames
# whole graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
pre_step_skipped = True
warnings.warn('No token analysis action selected. Skipped.')
# ** graph postprocessing
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
if pre_step_skipped:
raise RuntimeError(
(
'Preprocessing or token analysis step skipped. '
'Graph postprocessing cannot be performed.'
)
)
tk_graph_filtered = run_graph_postprocessing(tk_graph)
elif not SKIP_GRAPH_POSTPROCESSING:
# !! hardcoded result filenames
# filtered graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
warnings.warn('No graph postprocessing action selected. Skipped.')
# ** time analysis
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
# no check for fails, runs separately
ret = run_time_analysis()
elif not SKIP_TIME_ANALYSIS:
...
else:
warnings.warn('No time analysis action selected. Skipped.')
if __name__ == '__main__': if __name__ == '__main__':
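The rewritten script above swaps the hand-rolled DO_*/SKIP_* branching for a PipelineContainer that runs registered steps in order and honours per-step skip flags. A self-contained mock of that control flow, assuming only the `add(step, skip=...)`/`run()` usage visible in the diff; `MockPipelineContainer` is illustrative and not the real class from lang_main.pipelines.base:

```python
from collections.abc import Callable
from dataclasses import dataclass, field


@dataclass
class MockPipelineContainer:
    """Illustrative stand-in mirroring the add()/run() calls used in main.py."""

    name: str
    steps: list[tuple[Callable[[], None], bool]] = field(default_factory=list)

    def add(self, step: Callable[[], None], skip: bool = False) -> None:
        # remember the callable together with its skip flag
        self.steps.append((step, skip))

    def run(self) -> None:
        for step, skip in self.steps:
            if skip:
                print(f'[{self.name}] skipping {step.__name__}')
                continue
            step()


def run_preprocessing() -> None:
    print('preprocessing...')


def run_token_analysis() -> None:
    print('token analysis...')


container = MockPipelineContainer(name='Pipeline-Container-Base')
container.add(run_preprocessing, skip=False)
container.add(run_token_analysis, skip=True)  # mirrors a *_skip flag from config.toml
container.run()
```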

View File

@@ -1,190 +0,0 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** graph config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()

View File

@@ -1,9 +1,9 @@
import copy
import time import time
import webbrowser import webbrowser
from pathlib import Path from pathlib import Path
from threading import Thread from threading import Thread
from typing import cast from typing import cast
import copy
import dash_cytoscape as cyto import dash_cytoscape as cyto
from dash import Dash, Input, Output, State, dcc, html from dash import Dash, Input, Output, State, dcc, html
@@ -30,20 +30,20 @@ app = Dash(__name__, external_stylesheets=external_stylesheets)
cose_layout = { cose_layout = {
'name': 'cose', 'name': 'cose',
'nodeOverlap': 20, 'nodeOverlap': 500,
'refresh': 20, 'refresh': 20,
'fit': True, 'fit': True,
'padding': 30, 'padding': 20,
'randomize': True, 'randomize': False,
'componentSpacing': 40, 'componentSpacing': 1.2,
'nodeRepulsion': 2000, 'nodeRepulsion': 1000,
'edgeElasticity': 1000, 'edgeElasticity': 1000,
'idealEdgeLength': 100, 'idealEdgeLength': 100,
'nestingFactor': 1.2, 'nestingFactor': 1.2,
'gravity': 50, 'gravity': 50,
'numIter': 2000, 'numIter': 3000,
'initialTemp': 1000, 'initialTemp': 2000,
'coolingFactor': 0.95, 'coolingFactor': 0.7,
'minTemp': 1.0, 'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True, 'nodeDimensionsIncludeLabels': True,
} }
@@ -108,9 +108,8 @@ my_stylesheet = [
# {'selector': '.triangle', 'style': {'shape': 'triangle'}}, # {'selector': '.triangle', 'style': {'shape': 'triangle'}},
] ]
app.layout = html.Div( layout = html.Div(
[ [
html.Button('Trigger JS Layout', id='test_js'),
html.Button('Trigger JS Weight', id='test_js_weight'), html.Button('Trigger JS Weight', id='test_js_weight'),
html.Div(id='output'), html.Div(id='output'),
html.Div( html.Div(
@@ -166,11 +165,13 @@ app.layout = html.Div(
style={'width': '40%'}, style={'width': '40%'},
), ),
html.H3('Graph'), html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div( html.Div(
[ [
cyto.Cytoscape( cyto.Cytoscape(
id='cytoscape-graph', id='cytoscape-graph',
style={'width': '100%', 'height': '600px'}, style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet, stylesheet=my_stylesheet,
elements=cyto_data_base, elements=cyto_data_base,
zoom=1, zoom=1,
@@ -192,6 +193,9 @@ app.layout = html.Div(
) )
app.layout = layout
@app.callback( @app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True), Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'), Input('layout_choice', 'value'),
@@ -266,17 +270,17 @@ app.clientside_callback(
""" """
function(n_clicks, layout) { function(n_clicks, layout) {
layout.edgeElasticity = function(edge) { layout.edgeElasticity = function(edge) {
return edge.data().weight * 4; return edge.data().weight * 0.05;
}; };
layout.idealEdgeLength = function(edge) { layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.8; return edge.data().weight * 0.4;
}; };
cy.layout(layout).run(); cy.layout(layout).run();
return layout; return layout;
} }
""", """,
Output('cytoscape-graph', 'layout', allow_duplicate=True), Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('test_js', 'n_clicks'), Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'), State('cytoscape-graph', 'layout'),
prevent_initial_call=True, prevent_initial_call=True,
) )

View File

@@ -1,368 +0,0 @@
import json
import os
import dash
import dash_cytoscape as cyto
from dash import Input, Output, State, callback, dcc, html
# Load extra layouts
cyto.load_extra_layouts()
# Display utility functions
def _merge(a, b):
return dict(a, **b)
def _omit(omitted_keys, d):
return {k: v for k, v in d.items() if k not in omitted_keys}
# Custom Display Components
def Card(children, **kwargs):
return html.Section(
children,
style=_merge(
{
'padding': 20,
'margin': 5,
'borderRadius': 5,
'border': 'thin lightgrey solid',
'background-color': 'white',
# Remove possibility to select the text for better UX
'user-select': 'none',
'-moz-user-select': 'none',
'-webkit-user-select': 'none',
'-ms-user-select': 'none',
},
kwargs.get('style', {}),
),
**_omit(['style'], kwargs),
)
def SectionTitle(title, size, align='center', color='#222'):
return html.Div(
style={'text-align': align, 'color': color},
children=dcc.Markdown('#' * size + ' ' + title),
)
def NamedCard(title, size, children, **kwargs):
size = min(size, 6)
size = max(size, 1)
return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])
def NamedSlider(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[
html.P(f'{name}:'),
html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
],
)
def NamedDropdown(name, **kwargs):
return html.Div(
style={'margin': '10px 0px'},
children=[
html.P(children=f'{name}:', style={'margin-left': '3px'}),
dcc.Dropdown(**kwargs),
],
)
def NamedRadioItems(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
)
def NamedInput(name, **kwargs):
return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])
# Utils
def DropdownOptionsList(*args):
return [{'label': val.capitalize(), 'value': val} for val in args]
asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')
app = dash.Dash(__name__, assets_folder=asset_path)
server = app.server
# ###################### DATA PREPROCESSING ######################
# Load data
with open('sample_network.txt', 'r', encoding='utf-8') as f:
network_data = f.read().split('\n')
# We select the first 750 edges and associated nodes for an easier visualization
edges = network_data[:750]
nodes = set()
following_node_di = {} # user id -> list of users they are following
following_edges_di = {} # user id -> list of cy edges starting from user id
followers_node_di = {} # user id -> list of followers (cy_node format)
followers_edges_di = {} # user id -> list of cy edges ending at user id
cy_edges = []
cy_nodes = []
for edge in edges:
if ' ' not in edge:
continue
source, target = edge.split(' ')
cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}
if source not in nodes:
nodes.add(source)
cy_nodes.append(cy_source)
if target not in nodes:
nodes.add(target)
cy_nodes.append(cy_target)
# Process dictionary of following
if not following_node_di.get(source):
following_node_di[source] = []
if not following_edges_di.get(source):
following_edges_di[source] = []
following_node_di[source].append(cy_target)
following_edges_di[source].append(cy_edge)
# Process dictionary of followers
if not followers_node_di.get(target):
followers_node_di[target] = []
if not followers_edges_di.get(target):
followers_edges_di[target] = []
followers_node_di[target].append(cy_source)
followers_edges_di[target].append(cy_edge)
genesis_node = cy_nodes[0]
genesis_node['classes'] = 'genesis'
default_elements = [genesis_node]
default_stylesheet = [
{'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
{
'selector': 'edge',
'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
},
{'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
{
'selector': '.followerEdge',
'style': {
'mid-target-arrow-color': 'blue',
'mid-target-arrow-shape': 'vee',
'line-color': '#0074D9',
},
},
{'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
{
'selector': '.followingEdge',
'style': {
'mid-target-arrow-color': 'red',
'mid-target-arrow-shape': 'vee',
'line-color': '#FF4136',
},
},
{
'selector': '.genesis',
'style': {
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'purple',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': '#B10DC9',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': ':selected',
'style': {
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': 'black',
'font-size': 12,
'z-index': 9999,
},
},
]
# ################################# APP LAYOUT ################################
styles = {
'json-output': {
'overflow-y': 'scroll',
'height': 'calc(50% - 25px)',
'border': 'thin lightgrey solid',
},
'tab': {'height': 'calc(98vh - 80px)'},
}
app.layout = html.Div(
[
html.Div(
className='eight columns',
children=[
cyto.Cytoscape(
id='cytoscape',
elements=default_elements,
stylesheet=default_stylesheet,
style={'height': '95vh', 'width': '100%'},
)
],
),
html.Div(
className='four columns',
children=[
dcc.Tabs(
id='tabs',
children=[
dcc.Tab(
label='Control Panel',
children=[
NamedDropdown(
name='Layout',
id='dropdown-layout',
options=DropdownOptionsList(
'random',
'grid',
'circle',
'concentric',
'breadthfirst',
'cose',
'cose-bilkent',
'dagre',
'cola',
'klay',
'spread',
'euler',
),
value='grid',
clearable=False,
),
NamedRadioItems(
name='Expand',
id='radio-expand',
options=DropdownOptionsList('followers', 'following'),
value='followers',
),
],
),
dcc.Tab(
label='JSON',
children=[
html.Div(
style=styles['tab'],
children=[
html.P('Node Object JSON:'),
html.Pre(
id='tap-node-json-output',
style=styles['json-output'],
),
html.P('Edge Object JSON:'),
html.Pre(
id='tap-edge-json-output',
style=styles['json-output'],
),
],
)
],
),
],
),
],
),
]
)
# ############################## CALLBACKS ####################################
@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
def display_tap_node(data):
return json.dumps(data, indent=2)
@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
def display_tap_edge(data):
return json.dumps(data, indent=2)
@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
def update_cytoscape_layout(layout):
return {'name': layout}
@callback(
Output('cytoscape', 'elements'),
Input('cytoscape', 'tapNodeData'),
State('cytoscape', 'elements'),
State('radio-expand', 'value'),
)
def generate_elements(nodeData, elements, expansion_mode):
if not nodeData:
return default_elements
# If the node has already been expanded, we don't expand it again
if nodeData.get('expanded'):
return elements
# This retrieves the currently selected element, and tag it as expanded
for element in elements:
if nodeData['id'] == element.get('data').get('id'):
element['data']['expanded'] = True
break
if expansion_mode == 'followers':
followers_nodes = followers_node_di.get(nodeData['id'])
followers_edges = followers_edges_di.get(nodeData['id'])
if followers_nodes:
for node in followers_nodes:
node['classes'] = 'followerNode'
elements.extend(followers_nodes)
if followers_edges:
for follower_edge in followers_edges:
follower_edge['classes'] = 'followerEdge'
elements.extend(followers_edges)
elif expansion_mode == 'following':
following_nodes = following_node_di.get(nodeData['id'])
following_edges = following_edges_di.get(nodeData['id'])
if following_nodes:
for node in following_nodes:
if node['data']['id'] != genesis_node['data']['id']:
node['classes'] = 'followingNode'
elements.append(node)
if following_edges:
for follower_edge in following_edges:
follower_edge['classes'] = 'followingEdge'
elements.extend(following_edges)
return elements
if __name__ == '__main__':
app.run_server(debug=True)

File diff suppressed because it is too large.

View File

@@ -0,0 +1,507 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import dash_cytoscape as cyto
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from pandas import DataFrame
from lang_main.analysis import graphs
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from lang_main.analysis import tokens
from lang_main.constants import SPCY_MODEL
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
# p_df = Path(r'../Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
# p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** figure config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
# ** graphs
target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
p = Path(target).resolve()
ret = load_pickle(p)
tk_graph = cast(graphs.TokenGraph, ret[0])
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data_base, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
MIN_WEIGHT = weight_data['min']
MAX_WEIGHT = weight_data['max']
cyto.load_extra_layouts()
cose_layout = {
'name': 'cose',
'nodeOverlap': 500,
'refresh': 20,
'fit': True,
'padding': 20,
'randomize': False,
'componentSpacing': 1.2,
'nodeRepulsion': 1000,
'edgeElasticity': 1000,
'idealEdgeLength': 100,
'nestingFactor': 1.2,
'gravity': 50,
'numIter': 3000,
'initialTemp': 2000,
'coolingFactor': 0.7,
'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True,
}
my_stylesheet = [
# Group selectors
{
'selector': 'node',
'style': {
'shape': 'circle',
'content': 'data(label)',
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'color': 'black',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': 'edge',
'style': {
#'width': f'mapData(weight, {MIN_WEIGHT}, {MAX_WEIGHT}, 1, 10)',
# 'width': """function(ele) {
# return ele.data('weight');
# """,
'curve-style': 'bezier',
'line-color': 'grey',
'line-style': 'solid',
'line-opacity': 1,
},
},
# Class selectors
# {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
# {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]
# ** app
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = Dash(__name__, external_stylesheets=external_stylesheets)
graph_layout = html.Div(
[
html.Button('Trigger JS Weight', id='test_js_weight'),
html.Button('Trigger Candidate Graph', id='cand_graph'),
html.Div(id='output'),
html.Div(
[
html.H2('Token Graph', style={'margin': 0}),
html.Button(
'Reset Default',
id='bt-reset',
style={
'marginLeft': 'auto',
'width': '300px',
},
),
],
style={
'display': 'flex',
'marginBottom': '1em',
},
),
html.H3('Layout'),
dcc.Dropdown(
id='layout_choice',
options=[
'cose',
'cola',
'euler',
'random',
],
value='cose',
clearable=False,
),
html.Div(
[
html.H3('Graph Filter'),
dcc.Input(
id='weight_min',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Minimum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
dcc.Input(
id='weight_max',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Maximum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div(
[
cyto.Cytoscape(
id='cytoscape-graph',
style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet,
elements=cyto_data_base,
zoom=1,
),
],
style={
'border': '3px solid black',
'borderRadius': '25px',
'marginTop': '1em',
'marginBottom': '2em',
'padding': '7px',
},
),
],
style={'marginTop': '1em'},
),
],
)
app.layout = html.Div(
[
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(
[dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
),
graph_layout,
],
style={'margin': '2em'},
)
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
# obj_id = int(obj_id)
# # cands
# cands_obj_id = cands[obj_id]
# cands_choice = cands_obj_id[int(index) - 1]
# # data
# df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def pre_filter_data(
data: DataFrame,
idx: int,
obj_id: ObjectID,
) -> DataFrame:
obj_id = int(obj_id)
data = data.copy()
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(idx) - 1]
# data
data = data.loc[list(cands_choice)].sort_index() # type: ignore
return data
# ** graph
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'min', allow_duplicate=True),
Output('weight_min', 'max', allow_duplicate=True),
Output('weight_min', 'placeholder', allow_duplicate=True),
Output('weight_max', 'min', allow_duplicate=True),
Output('weight_max', 'max', allow_duplicate=True),
Output('weight_max', 'placeholder', allow_duplicate=True),
Input('cand_graph', 'n_clicks'),
State('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_graph_candidates(_, index, obj_id):
df = pre_filter_data(data, idx=index, obj_id=obj_id)
tk_graph_cands, _ = tokens.build_token_graph(
data=df,
model=SPCY_MODEL,
target_feature='VorgangsBeschreibung',
build_map=False,
)
cyto_data, weight_info = graphs.convert_graph_to_cytoscape(tk_graph_cands)
weight_min = weight_info['min']
weight_max = weight_info['max']
placeholder_min = f'Minimum edge weight: {weight_min} - {weight_max}'
placeholder_max = f'Maximum edge weight: {weight_min} - {weight_max}'
return (
cyto_data,
weight_min,
weight_max,
placeholder_min,
weight_min,
weight_max,
placeholder_max,
)
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'),
prevent_initial_call=True,
)
def update_layout_internal(layout_choice):
# return {'name': layout_choice}
return cose_layout
# return cose_bilkent_layout
# return cola_layout
@app.callback(
Output('cytoscape-graph', 'zoom'),
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'value'),
Output('weight_max', 'value'),
Input('bt-reset', 'n_clicks'),
prevent_initial_call=True,
)
def reset_layout(n_clicks):
return (1, cyto_data_base, None, None)
# update edge weight
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Input('weight_min', 'value'),
Input('weight_max', 'value'),
prevent_initial_call=True,
)
def update_edge_weight(weight_min, weight_max):
if not any([weight_min, weight_max]):
return cyto_data_base
if weight_min is None:
weight_min = MIN_WEIGHT
if weight_max is None:
weight_max = MAX_WEIGHT
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, weight_min, weight_max)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(weight_min, weight_max)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data, _ = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
return cyto_data
app.clientside_callback(
"""
function(n_clicks, layout) {
layout.edgeElasticity = function(edge) {
return edge.data().weight * 0.05;
};
layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.4;
};
cy.layout(layout).run();
return layout;
}
""",
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'),
prevent_initial_call=True,
)
app.clientside_callback(
"""
function(n_clicks, stylesheet) {
function edge_weight(ele) {
let threshold = 1000;
let weight = ele.data('weight');
if (weight > threshold) {
weight = 12;
} else {
weight = weight / threshold * 10;
weight = Math.max(1, weight);
}
return weight;
}
stylesheet[1].style.width = edge_weight;
cy.style(stylesheet).update();
return stylesheet;
}
""",
Output('cytoscape-graph', 'stylesheet'),
Input('test_js_weight', 'n_clicks'),
State('cytoscape-graph', 'stylesheet'),
prevent_initial_call=False,
)
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()

View File

@@ -1,38 +0,0 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis]
threshold_unique_texts = 5

View File

@@ -2,22 +2,20 @@
[paths] [paths]
inputs = './inputs/' inputs = './inputs/'
results = './results/test_20240529/' results = './results/test_20240619/'
dataset = '../data/02_202307/Export4.csv' dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/' #results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/' #results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = false preprocessing_skip = true
preprocessing_skip = false token_analysis_skip = true
token_analysis = true
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = true graph_postprocessing_skip = true
time_analysis = false time_analysis_skip = false
time_analysis_skip = true
#[export_filenames] #[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' #filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

View File

@@ -1,3 +1,5 @@
from __future__ import annotations
import copy import copy
import sys import sys
import typing import typing
@@ -169,6 +171,90 @@ def convert_graph_to_cytoscape(
return cyto_data, weight_metadata return cyto_data, weight_metadata
def filter_graph_by_edge_weight(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(graph.edges)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_graph_by_node_degree(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(graph.nodes)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
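The dashboard prototype further up already chains these new module-level filters back to back. A short usage sketch along the same lines; the pickle path and filenames are assumptions, while the call signatures match the definitions above:

```python
from pathlib import Path

from lang_main.analysis import graphs
from lang_main.io import load_pickle

# load a previously pickled token graph (path and filename are placeholders)
ret = load_pickle(Path('./results/test_20240619/TOKEN_ANALYSIS.pkl'))
tk_graph = ret[0]

# keep edges with weight >= 150, then drop nodes left without any connection
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
tk_graph_filtered.to_GraphML(Path('./results'), filename='TokenGraph-filtered', directed=False)
```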
# ** ---------------------------------------
class TokenGraph(DiGraph): class TokenGraph(DiGraph):
def __init__( def __init__(
self, self,
@@ -286,87 +372,6 @@ class TokenGraph(DiGraph):
graph=self._undirected, logging=logging graph=self._undirected, logging=logging
) )
def filter_by_edge_weight(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(self.edges)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_by_node_degree(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(self.nodes)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def _save_prepare( def _save_prepare(
self, self,
path: Path, path: Path,
@@ -379,14 +384,13 @@ class TokenGraph(DiGraph):
return saving_path return saving_path
def save_graph( def to_GraphML(
self, self,
path: Path, path: Path,
filename: str | None = None, filename: str | None = None,
directed: bool = False, directed: bool = False,
) -> None: ) -> None:
"""save one of the stored graphs to disk file, """save one of the stored graphs to GraphML format on disk,
currently only GraphML format is supported
Parameters Parameters
---------- ----------

View File

@@ -22,7 +22,7 @@ from lang_main.analysis.shared import (
similar_index_groups, similar_index_groups,
) )
from lang_main.loggers import logger_preprocess as logger from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex from lang_main.types import Embedding, PandasIndex
# ** RE patterns # ** RE patterns
@@ -119,10 +119,9 @@ def remove_duplicates(
).copy() ).copy()
logger.info('Removed all duplicates from dataset successfully.') logger.info('Removed all duplicates from dataset successfully.')
logger.info( logger.info(
( 'New Dataset properties: number of entries: %d, number of features %d',
f'New Dataset properties: number of entries: {len(wo_duplicates)}, ' len(wo_duplicates),
f'number of features {len(wo_duplicates.columns)}' len(wo_duplicates.columns),
)
) )
return (wo_duplicates,) return (wo_duplicates,)
@@ -176,6 +175,7 @@ def clean_string_slim(string: str) -> str:
string = pattern_special_chars.sub(' ', string) string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string) string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string) # string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string) string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end # remove whitespaces at the beginning and the end
string = string.strip() string = string.strip()
@@ -241,11 +241,84 @@ def analyse_feature(
return (result_df,) return (result_df,)
# ** pre-filter
def numeric_pre_filter_feature(
data: DataFrame,
feature: str,
bound_lower: int | None,
bound_upper: int | None,
) -> tuple[DataFrame]:
if not any([bound_lower, bound_upper]):
raise ValueError('No bounds for filtering provided')
data = data.copy()
if bound_lower is None:
bound_lower = cast(int, data[feature].min())
if bound_upper is None:
bound_upper = cast(int, data[feature].max())
filter_lower = data[feature] >= bound_lower
filter_upper = data[feature] <= bound_upper
filter = filter_lower & filter_upper
data = data.loc[filter]
return (data,)
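numeric_pre_filter_feature generalises the character-length cut that used to live inline in scripts/main.py (entries shorter than THRESHOLD_AMOUNT_CHARACTERS were dropped). A small sketch with toy data; the import path of this preprocessing module is an assumption:

```python
from pandas import DataFrame

# module path assumed; the function is defined in the preprocessing module patched here
from lang_main.analysis.preprocessing import numeric_pre_filter_feature

data = DataFrame(
    {
        'entry': ['ok', 'Pumpe defekt, Austausch veranlasst'],
        'len': [2, 34],
    }
)
# keep rows whose 'len' value is at least 5; no upper bound
(filtered,) = numeric_pre_filter_feature(data, feature='len', bound_lower=5, bound_upper=None)
print(filtered)
```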
# ** embedding based similarity # ** embedding based similarity
# following functions used to identify similar entries to have # following functions used to identify similar entries to have
# a more robust identification of duplicates negating negative side effects # a more robust identification of duplicates negating negative side effects
# of several disturbances like typos, escape characters, etc. # of several disturbances like typos, escape characters, etc.
# build mapping of embeddings for given model # build mapping of embeddings for given model
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data,)
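merge_similarity_dupl (moved up here) leans on candidates_by_index to propose near-duplicate pairs via sentence embeddings. A standalone sketch of that candidate search, following the same steps as the legacy helper removed from io.py further down; the model name, the sample strings and the 0.8 threshold (threshold_similarity in the config) are illustrative:

```python
import numpy as np
from sentence_transformers import SentenceTransformer, util

entries = ['Pumpe defekt', 'Pumpe defekt!!', 'Filter getauscht']
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # model choice is an assumption

embeddings = model.encode(entries, convert_to_tensor=True, show_progress_bar=False)
cos_sim = util.cos_sim(embeddings, embeddings).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)  # ignore self-similarity
cos_sim = np.triu(cos_sim)      # count every pair only once
for i, j in np.argwhere(cos_sim >= 0.8):
    print(f'candidate pair: {entries[i]!r} <-> {entries[j]!r}')
```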
#####################################################################
def build_embedding_map( def build_embedding_map(
data: Series, data: Series,
model: GermanSpacyModel | SentenceTransformer, model: GermanSpacyModel | SentenceTransformer,
@ -373,7 +446,7 @@ def list_cosSim_dupl_candidates(
save_candidates: bool = False, save_candidates: bool = False,
saving_path: Path | None = None, saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates', filename: str = 'CosSim-FilterCandidates',
pipeline: BasePipeline | None = None, pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than """providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes given threshold; more suitable for debugging purposes
@@ -465,53 +538,6 @@ def similar_ids_groups(
""" """
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data.copy(),)
# merge duplicates # merge duplicates
def merge_similarity_dupl_old( def merge_similarity_dupl_old(
data: DataFrame, data: DataFrame,

View File

@@ -24,13 +24,13 @@ PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
# if not PATH_TO_DATASET.exists(): # if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.') # raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control # ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] # DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip'] SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] # DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip'] SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] # DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip'] SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis'] # DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip'] SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models # ** models
@@ -66,11 +66,11 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
] ]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id'] FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input # ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple( MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features'] CONFIG['time_analysis']['model_input']['input_features']
) )
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature'] ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple( ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types'] CONFIG['time_analysis']['model_input']['activity_types']
) )
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][ THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
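With the DO_* switches gone, only the *_skip flags from the trimmed [control] table drive execution. A hedged sketch of how such flags can be read; the inline TOML mirrors the new config.toml above, and tomllib is already what lang_main.io uses for the real config file:

```python
import tomllib

config_text = """
[control]
preprocessing_skip = true
token_analysis_skip = true
graph_postprocessing_skip = true
time_analysis_skip = false
"""

config = tomllib.loads(config_text)
SKIP_PREPROCESSING: bool = config['control']['preprocessing_skip']
SKIP_TIME_ANALYSIS: bool = config['control']['time_analysis_skip']
print(SKIP_PREPROCESSING, SKIP_TIME_ANALYSIS)  # True False
```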

View File

@@ -1,4 +1,3 @@
import os
import pickle import pickle
import shutil import shutil
import tomllib import tomllib
@@ -22,7 +21,7 @@ def create_saving_folder(
if overwrite_existing: if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it) # overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder) shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder) saving_path_folder.mkdir(parents=True)
else: else:
logger.info( logger.info(
( (
@@ -62,56 +61,14 @@
return obj return obj
# TODO: remove, too specialised for common application
"""
def filter_candidates_idx(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    common function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed of data as Series to retain information about indices of entries
    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float
        threshold for cosine similarity to filter candidates
    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        index pairs which meet the cosine similarity threshold
    # embeddings
    batch = typing.cast(list[str],
        data_model_input.to_list())
    embds = typing.cast(Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ))
    # cosine similarity
    cos_sim = typing.cast(
        npt.NDArray,
        sentence_transformers.util.cos_sim(embds, embds).numpy()
    )
    np.fill_diagonal(cos_sim, 0.)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
    for idx_array in cos_sim_idx:
        idx_pair = typing.cast(
            tuple[np.int64, np.int64],
            tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair
"""
def get_entry_point(
    saving_path: Path,
    filename: str,
) -> Path:
    entry_point_path = (saving_path / filename).with_suffix('.pkl')
    if not entry_point_path.exists():
        raise FileNotFoundError(
            f'Could not find provided entry data under path: >>{entry_point_path}<<'
        )
    return entry_point_path

View File

@ -9,14 +9,12 @@ dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7_trunc/' #results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = true
preprocessing_skip = false preprocessing_skip = false
token_analysis = false
token_analysis_skip = false token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false time_analysis_skip = false
#[export_filenames] #[export_filenames]
@ -42,9 +40,12 @@ criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID' feature_name_obj_id = 'ObjektID'
[time_analysis.model_input] [time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [ input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung', 'VorgangsBeschreibung',
] ]
activity_feature = 'VorgangsTypName' activity_feature = 'VorgangsTypName'
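The [control] section now carries only the *_skip switches. A short sketch of reading them with the standard library (the config file name is illustrative, the keys are the ones shown above):

import tomllib

with open('lang_main_config.toml', 'rb') as f:  # illustrative file name
    config = tomllib.load(f)

skip_preprocessing: bool = config['control']['preprocessing_skip']
input_features = tuple(config['time_analysis']['model_input']['input_features'])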

View File

@ -1,9 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Callable from collections.abc import Callable
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Never, cast
from typing_extensions import override
from lang_main.loggers import logger_pipelines as logger
from lang_main.io import load_pickle, save_pickle from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_pipelines as logger
from lang_main.types import ResultHandling
# ** pipelines to perform given actions on dataset in a customisable manner # ** pipelines to perform given actions on dataset in a customisable manner
@ -12,7 +17,18 @@ class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline""" """Error describing that no action is available in the current pipeline"""
-class BasePipeline:
+class WrongActionTypeError(Exception):
+    """Error raised if added action type is not supported by corresponding pipeline"""
+
+
+class OutputInPipelineContainerError(Exception):
+    """Error raised if an output was detected by one of the performed
+    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
+    procedure which does not have any parameters or return values and should therefore not
+    return any values."""
+
+
+class BasePipeline(ABC):
def __init__( def __init__(
self, self,
name: str, name: str,
@ -25,18 +41,12 @@ class BasePipeline:
self.name = name self.name = name
# working directory for pipeline == output path # working directory for pipeline == output path
self.working_dir = working_dir self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass # container for actions to perform during pass
self.actions: list[Callable] = [] self.actions: list[Callable] = []
self.action_names: list[str] = [] self.action_names: list[str] = []
self.actions_kwargs: list[dict[str, Any]] = []
self.is_save_result: list[bool] = []
# progress tracking, start at 1 # progress tracking, start at 1
self.curr_proc_idx: int = 1 self.curr_proc_idx: int = 1
# intermediate result
self._intermediate_result: Any | None = None
def __repr__(self) -> str: def __repr__(self) -> str:
return ( return (
@ -44,15 +54,132 @@ class BasePipeline:
f'working dir: {self.working_dir}, contents: {self.action_names})' f'working dir: {self.working_dir}, contents: {self.action_names})'
) )
-    @property
-    def intermediate_result(self) -> Any:
-        return self._intermediate_result
+    def panic_wrong_action_type(
+        self,
+        action: Any,
+        compatible_type: str,
+    ) -> Never:
+        raise WrongActionTypeError(
+            (
+                f'Action must be of type {compatible_type}, '
+                f'but is of type >>{type(action)}<<.'
+            )
+        )
def prep_run(self) -> None:
logger.info('Starting pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
'The pipeline does not contain any performable actions.'
)
def post_run(self) -> None:
logger.info(
'Processing pipeline >>%s<< successfully ended after %d steps.',
self.name,
(self.curr_proc_idx - 1),
)
@abstractmethod
def add(self) -> None: ...
@abstractmethod
def logic(self) -> None: ...
def run(self, *args, **kwargs) -> Any:
self.prep_run()
ret = self.logic(*args, **kwargs)
self.post_run()
return ret
class PipelineContainer(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
super().__init__(name=name, working_dir=working_dir)
self.action_skip: list[bool] = []
@override
def add(
self,
action: Callable,
skip: bool = False,
) -> None:
if isinstance(action, Callable):
self.actions.append(action)
self.action_names.append(action.__name__)
self.action_skip.append(skip)
else:
self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
@override
def logic(self) -> None:
for idx, (action, action_name) in enumerate(zip(self.actions, self.action_names)):
# loading
if self.action_skip[idx]:
logger.info('[No Calculation] Skipping >>%s<<...', action_name)
self.curr_proc_idx += 1
continue
# calculation
ret = action()
if ret is not None:
raise OutputInPipelineContainerError(
(
f'Output in PipelineContainers not allowed. Action {action_name} '
f'returned values in Container {self.name}.'
)
)
# processing tracking
self.curr_proc_idx += 1
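A minimal sketch of how a PipelineContainer might be assembled from parameterless stage procedures; the stage functions and the working directory are hypothetical, only add(skip=...) and run() come from the class above:

from pathlib import Path

from lang_main.pipelines.base import PipelineContainer


def run_preprocessing() -> None:  # hypothetical stage procedure, must not return a value
    ...


def run_token_analysis() -> None:  # hypothetical stage procedure
    ...


container = PipelineContainer(name='Main', working_dir=Path('./results'))
container.add(run_preprocessing)
container.add(run_token_analysis, skip=True)  # skipped stages are only logged, not executed
container.run()  # raises OutputInPipelineContainerError if any stage returns a value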
class Pipeline(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
# init base class
super().__init__(name=name, working_dir=working_dir)
# name of pipeline
self.name = name
# working directory for pipeline == output path
self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass
self.actions_kwargs: list[dict[str, Any]] = []
self.save_results: ResultHandling = []
self.load_results: ResultHandling = []
# intermediate result
self._intermediate_result: tuple[Any, ...] | None = None
def __repr__(self) -> str:
return (
f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
# @property
# def intermediate_result(self) -> tuple[Any, ...] | None:
# return self._intermediate_result
@override
def add( def add(
self, self,
action: Callable, action: Callable,
action_kwargs: dict[str, Any] = {}, action_kwargs: dict[str, Any] = {},
save_result: bool = False, save_result: bool = False,
load_result: bool = False,
filename: str | None = None,
) -> None: ) -> None:
# check explicitly for function type # check explicitly for function type
# if isinstance(action, FunctionType): # if isinstance(action, FunctionType):
@ -60,11 +187,10 @@ class BasePipeline:
self.actions.append(action) self.actions.append(action)
self.action_names.append(action.__name__) self.action_names.append(action.__name__)
self.actions_kwargs.append(action_kwargs.copy()) self.actions_kwargs.append(action_kwargs.copy())
-            self.is_save_result.append(save_result)
+            self.save_results.append((save_result, filename))
+            self.load_results.append((load_result, filename))
        else:
-            raise TypeError(
-                f'Action must be custom function, but is of type >>{type(action)}<<.'
-            )
+            self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
# TODO: add multiple entries by utilising simple add method # TODO: add multiple entries by utilising simple add method
""" """
@ -88,57 +214,84 @@ class BasePipeline:
f"but is of type >>{type(action)}<<.")) f"but is of type >>{type(action)}<<."))
""" """
-    def save_curr_result(
-        self,
-        filename: str,
-    ) -> None:
-        target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
-        target_path = self.working_dir.joinpath(target_filename)
-        target_path = target_path.with_suffix('.pkl')
-        # saving file locally
-        save_pickle(obj=self._intermediate_result, path=target_path)
-
-    def load_intermediate_result(
-        self,
-        saving_path: str,
-        filename: str,
-    ) -> tuple[Any, ...]:
-        target_path = Path(saving_path + filename).with_suffix('.pkl')
-        # loading DataFrame or Series from pickle
-        data = load_pickle(target_path)
-        return data
-
-    def prep_run(self) -> None:
-        logger.info('Starting processing pipeline >>%s<<...', self.name)
-        # progress tracking
-        self.curr_proc_idx = 1
-        # check if performable actions available
-        if len(self.actions) == 0:
-            raise NoPerformableActionError(
-                'The pipeline does not contain any performable actions.'
-            )
-
-    def run(
-        self,
-        starting_values: tuple[Any, ...],
-    ) -> tuple[Any, ...]:
-        # prepare start
-        self.prep_run()
-        for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
-            if idx == 0:
-                ret = action(*starting_values, **action_kwargs)
-            else:
-                ret = action(*ret, **action_kwargs)
-            # save intermediate result
-            self._intermediate_result = ret
-            # check if result should be saved locally
-            if self.is_save_result[idx]:
-                self.save_curr_result(filename=self.action_names[idx])
-            # processing tracking
-            self.curr_proc_idx += 1
-        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
-        return ret
+    def get_result_path(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Path, str]:
+        action_name = self.action_names[action_idx]
+        if filename is None:
+            target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_{action_name}'
+        else:
+            target_filename = filename
+        target_path = self.working_dir.joinpath(target_filename).with_suffix('.pkl')
+        return target_path, action_name
+
+    def load_step(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Any, ...]:
+        target_path, action_name = self.get_result_path(action_idx, filename)
+        if not target_path.exists():
+            raise FileNotFoundError(
+                (
+                    f'No intermediate results for action >>{action_name}<< '
+                    f'under >>{target_path}<< found'
+                )
+            )
+        # results should be tuple, but that is not guaranteed
+        result_loaded = cast(tuple[Any, ...], load_pickle(target_path))
+        if not isinstance(result_loaded, tuple):
+            raise TypeError(f'Loaded results must be tuple, not {type(result_loaded)}')
+        return result_loaded
+
+    def save_step(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> None:
+        # target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
+        # target_path = self.working_dir.joinpath(target_filename)
+        # target_path = target_path.with_suffix('.pkl')
+        target_path, _ = self.get_result_path(action_idx, filename)
+        # saving file locally
+        save_pickle(obj=self._intermediate_result, path=target_path)
+
+    @override
+    def logic(
+        self,
+        starting_values: tuple[Any, ...],
+    ) -> tuple[Any, ...]:
+        for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
+            # loading
+            if self.load_results[idx][0]:
+                filename = self.load_results[idx][1]
+                ret = self.load_step(action_idx=idx, filename=filename)
+                logger.info(
+                    '[No Calculation] Loaded result for action >>%s<< successfully',
+                    self.action_names[idx],
+                )
+                self.curr_proc_idx += 1
+                continue
+            # calculation
+            if idx == 0:
+                ret = action(*starting_values, **action_kwargs)
+            else:
+                ret = action(*ret, **action_kwargs)
+            if not isinstance(ret, tuple):
+                ret = (ret,)
+            ret = cast(tuple[Any, ...], ret)
+            # save intermediate result
+            self._intermediate_result = ret
+            # saving result locally, always save last action
+            if self.save_results[idx][0] or idx == (len(self.actions) - 1):
+                filename = self.save_results[idx][1]
+                self.save_step(action_idx=idx, filename=filename)
+            # processing tracking
+            self.curr_proc_idx += 1
+        return ret
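A minimal sketch of the new per-step save/load handling from the caller's side; the actions and the file name are hypothetical:

from pathlib import Path

from lang_main.pipelines.base import Pipeline


def load_table(path: Path):  # hypothetical first action
    ...


def clean_table(table):  # hypothetical follow-up action, receives the previous result
    ...


pipe = Pipeline(name='Demo', working_dir=Path('./results'))
pipe.add(load_table)
pipe.add(clean_table, save_result=True, filename='CLEANED')  # stored as CLEANED.pkl in working_dir
# a later run could replay the stored result instead of recomputing:
# pipe.add(clean_table, load_result=True, filename='CLEANED')
results = pipe.run((Path('./data.csv'),))  # starting values feed the first action; a tuple is returned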

View File

@ -1,9 +1,11 @@
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import ( from lang_main.analysis.preprocessing import (
analyse_feature, analyse_feature,
clean_string_slim, clean_string_slim,
entry_wise_cleansing, entry_wise_cleansing,
load_raw_data, load_raw_data,
merge_similarity_dupl, merge_similarity_dupl,
numeric_pre_filter_feature,
remove_duplicates, remove_duplicates,
remove_NA, remove_NA,
) )
@ -23,40 +25,50 @@ from lang_main.constants import (
SAVE_PATH_FOLDER, SAVE_PATH_FOLDER,
SPCY_MODEL, SPCY_MODEL,
STFR_MODEL, STFR_MODEL,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
THRESHOLD_NUM_ACTIVITIES, THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY, THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY, THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS, THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE, UNIQUE_CRITERION_FEATURE,
) )
from lang_main.pipelines.base import BasePipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import EntryPoints
# ** pipeline configuration # ** pipeline configuration
# ** target feature preparation # ** target feature preparation
-pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
-pipe_target_feat.add(
-    load_raw_data,
-    {
-        'date_cols': DATE_COLS,
-    },
-)
-pipe_target_feat.add(remove_duplicates)
-pipe_target_feat.add(remove_NA, save_result=True)
-pipe_target_feat.add(
-    entry_wise_cleansing,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-        'cleansing_func': clean_string_slim,
-    },
-    save_result=True,
-)
-pipe_target_feat.add(
-    analyse_feature,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-    },
-    save_result=True,
-)
+def build_base_target_feature_pipe() -> Pipeline:
+    pipe_target_feat = Pipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
+    pipe_target_feat.add(
+        load_raw_data,
+        {
+            'date_cols': DATE_COLS,
+        },
+    )
+    pipe_target_feat.add(remove_duplicates)
+    pipe_target_feat.add(remove_NA, save_result=True)
+    pipe_target_feat.add(
+        entry_wise_cleansing,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+            'cleansing_func': clean_string_slim,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE,
+    )
+    pipe_target_feat.add(
+        analyse_feature,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+        },
+        save_result=True,
+    )
+    return pipe_target_feat
# output: DataFrame containing target feature with # output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs # number of occurrences and associated ObjectIDs
@ -81,68 +93,114 @@ pipe_target_feat.add(
# save_result=True, # save_result=True,
# ) # )
# ** Merge duplicates # ** Merge duplicates
-pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
-# pipe_merge.add(merge_similarity_dupl, save_result=True)
-pipe_merge.add(
-    merge_similarity_dupl,
-    {
-        'model': STFR_MODEL,
-        'cos_sim_threshold': THRESHOLD_SIMILARITY,
-    },
-    save_result=True,
-)
+def build_merge_duplicates_pipe() -> Pipeline:
+    pipe_merge = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
+    # pipe_merge.add(merge_similarity_dupl, save_result=True)
+    pipe_merge.add(
+        numeric_pre_filter_feature,
+        {
+            'feature': 'len',
+            'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
+            'bound_upper': None,
+        },
+    )
+    pipe_merge.add(
+        merge_similarity_dupl,
+        {
+            'model': STFR_MODEL,
+            'cos_sim_threshold': THRESHOLD_SIMILARITY,
+        },
+        save_result=True,
+        filename=EntryPoints.TOKEN_ANALYSIS,
+    )
+    return pipe_merge
# ** token analysis # ** token analysis
-pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_token_analysis.add(
-    build_token_graph,
-    {
-        'model': SPCY_MODEL,
-        'target_feature': 'entry',
-        'weights_feature': 'num_occur',
-        'batch_idx_feature': 'batched_idxs',
-        'build_map': True,
-        'batch_size_model': 50,
-    },
-    save_result=True,
-)
+def build_tk_graph_pipe() -> Pipeline:
+    pipe_token_analysis = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_token_analysis.add(
+        build_token_graph,
+        {
+            'model': SPCY_MODEL,
+            'target_feature': 'entry',
+            'weights_feature': 'num_occur',
+            'batch_idx_feature': 'batched_idxs',
+            'build_map': False,
+            'batch_size_model': 50,
+        },
+        save_result=True,
+        filename=EntryPoints.TK_GRAPH_POST,
+    )
+    return pipe_token_analysis
def build_tk_graph_post_pipe() -> Pipeline:
pipe_graph_postprocessing = Pipeline(
name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_edge_weight,
{
'bound_lower': THRESHOLD_EDGE_WEIGHT,
'bound_upper': None,
},
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_node_degree,
{
'bound_lower': 1,
'bound_upper': None,
},
save_result=True,
filename=EntryPoints.TK_GRAPH_ANALYSIS,
)
return pipe_graph_postprocessing
# ** timeline analysis # ** timeline analysis
-pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_timeline.add(
-    remove_non_relevant_obj_ids,
-    {
-        'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
-        'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-    },
-    save_result=True,
-)
-pipe_timeline.add(
-    generate_model_input,
-    {
-        'target_feature_name': 'nlp_model_input',
-        'model_input_features': MODEL_INPUT_FEATURES,
-    },
-)
-pipe_timeline.add(
-    filter_activities_per_obj_id,
-    {
-        'activity_feature': ACTIVITY_FEATURE,
-        'relevant_activity_types': ACTIVITY_TYPES,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-        'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
-    },
-)
-pipe_timeline.add(
-    get_timeline_candidates,
-    {
-        'model': STFR_MODEL,
-        'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-        'model_input_feature': 'nlp_model_input',
-    },
-    save_result=True,
-)
+def build_timeline_pipe() -> Pipeline:
+    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        remove_non_relevant_obj_ids,
+        {
+            'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
+            'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+        },
+        save_result=True,
+    )
+    pipe_timeline.add(
+        generate_model_input,
+        {
+            'target_feature_name': 'nlp_model_input',
+            'model_input_features': MODEL_INPUT_FEATURES,
+        },
+    )
+    pipe_timeline.add(
+        filter_activities_per_obj_id,
+        {
+            'activity_feature': ACTIVITY_FEATURE,
+            'relevant_activity_types': ACTIVITY_TYPES,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
+        },
+    )
+    pipe_timeline.add(
+        get_timeline_candidates,
+        {
+            'model': STFR_MODEL,
+            'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'model_input_feature': 'nlp_model_input',
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
+    return pipe_timeline
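The predefined pipelines are now built on demand instead of being instantiated at import time. A short sketch of the intended call pattern, assuming the raw dataset path is the starting value expected by load_raw_data:

from lang_main.constants import PATH_TO_DATASET
from lang_main.pipelines.predefined import build_base_target_feature_pipe

pipe_target_feat = build_base_target_feature_pipe()
results = pipe_target_feat.run((PATH_TO_DATASET,))  # tuple of starting values for the first action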

View File

@ -17,8 +17,20 @@ class LoggingLevels(enum.IntEnum):
# ** devices # ** devices
class STFRDeviceTypes(enum.StrEnum): class STFRDeviceTypes(enum.StrEnum):
CPU = 'cpu' CPU = enum.auto()
GPU = 'cuda' GPU = enum.auto()
# ** pipelines
ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
TIMELINE = 'TIMELINE'
TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
TOKEN_ANALYSIS = 'TOKEN_ANALYSIS'
# ** datasets # ** datasets
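One caveat on the device enum above: enum.auto() in a StrEnum yields the lowercase member name, so STFRDeviceTypes.GPU now evaluates to 'gpu' instead of the previous 'cuda'; if the value is passed directly to torch or sentence-transformers as a device string, the explicit 'cuda' may still be required. A two-line check with a stand-in enum:

import enum


class Device(enum.StrEnum):  # stand-in for STFRDeviceTypes, illustration only
    CPU = enum.auto()
    GPU = enum.auto()


assert Device.CPU == 'cpu' and Device.GPU == 'gpu'  # not 'cuda'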

1687
test-notebooks/misc.ipynb Normal file

File diff suppressed because it is too large