new pipeline management, prototype graph display and timeline

Florian Förster 2024-06-19 16:58:26 +02:00
parent c2714b8060
commit fb4437a3a2
21 changed files with 2838 additions and 11383 deletions

pdm.lock (generated)

@@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98"
content_hash = "sha256:e00f157f833ee7615d96375c352e2caa6b4f6b50e5615ccbefa79446189594c7"
[[package]]
name = "annotated-types"
@@ -2938,13 +2938,13 @@ files = [
[[package]]
name = "typing-extensions"
version = "4.11.0"
version = "4.12.2"
requires_python = ">=3.8"
summary = "Backported and Experimental Type Hints for Python 3.8+"
groups = ["default", "notebooks", "trials"]
files = [
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]


@@ -12,6 +12,7 @@ dependencies = [
"sentence-transformers>=2.7.0",
"numpy>=1.26.4",
"pip>=24.0",
"typing-extensions>=4.12.2",
]
requires-python = ">=3.11"
readme = "README.md"
@@ -47,4 +48,7 @@ quote-style = "single"
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = ["E", "F", "I"]
select = ["E", "F", "I"]
[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]
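With extra-standard-library = ["typing_extensions"], Ruff's isort rules sort typing_extensions into the standard-library import block instead of the third-party block. A minimal sketch of the resulting ordering (the pandas import is only there to mark the third-party group):

import typing
from pathlib import Path
from typing_extensions import override  # grouped with the standard library

import pandas as pd  # third-party imports follow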


@@ -1,42 +1,44 @@
import typing
import warnings
from pathlib import Path
from typing import cast
from pandas import DataFrame, Series
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
DO_TIME_ANALYSIS,
DO_TOKEN_ANALYSIS,
INPUT_PATH_FOLDER,
PATH_TO_DATASET,
SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING,
SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
from lang_main.io import create_saving_folder, load_pickle
from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import PipelineContainer
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
pipe_timeline,
pipe_token_analysis,
build_base_target_feature_pipe,
build_merge_duplicates_pipe,
build_timeline_pipe,
build_tk_graph_pipe,
build_tk_graph_post_pipe,
)
from lang_main.types import (
EntryPoints,
ObjectID,
PandasIndex,
SpacyDoc,
TimelineCandidates,
)
from pandas import DataFrame, Series
# ** build pipelines
pipe_merge = build_merge_duplicates_pipe()
pipe_target_feat = build_base_target_feature_pipe()
pipe_timeline = build_timeline_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
# ** processing pipeline
def run_preprocessing() -> DataFrame:
# ** preprocessing pipeline
def run_preprocessing() -> None:
create_saving_folder(
saving_path_folder=SAVE_PATH_FOLDER,
overwrite_existing=False,
@@ -46,134 +48,69 @@ def run_preprocessing() -> DataFrame:
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
)
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
subset_data = target_feat_data.loc[data_filter].copy()
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data
_ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))
def run_token_analysis(
preprocessed_data: DataFrame,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
# ** token analysis
def run_token_analysis() -> None:
# load entry point
entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TOKEN_ANALYSIS)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
# build token graph
(tk_graph, docs_mapping) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc]],
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
pipe_token_analysis.run(starting_values=(preprocessed_data,)),
)
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
return tk_graph, docs_mapping
tk_graph.to_GraphML(SAVE_PATH_FOLDER, filename='TokenGraph', directed=False)
def run_graph_postprocessing(
tk_graph: TokenGraph,
) -> TokenGraph:
def run_graph_postprocessing() -> None:
# load entry point
entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TK_GRAPH_POST)
loaded_results = cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
# filter graph by edge weight and remove single nodes (no connection)
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
tk_graph_filtered.save_graph(
ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
tk_graph_filtered = ret[0]
# tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
tk_graph_filtered.to_GraphML(
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
)
tk_graph_filtered.to_pickle(
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
)
return tk_graph_filtered
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
filename = 'without_nan'
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
ret = load_pickle(loading_path)
preprocessed_data = ret[0]
# ** time analysis
def run_time_analysis() -> None:
# load entry point
entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
ret = cast(
_ = cast(
tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)),
)
return ret
def verify_path(
loading_path: Path,
) -> None:
if not loading_path.exists():
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
def build_pipeline_container() -> PipelineContainer:
container = PipelineContainer(
name='Pipeline-Container-Base', working_dir=SAVE_PATH_FOLDER
)
container.add(run_preprocessing, skip=SKIP_PREPROCESSING)
container.add(run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
container.add(run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
container.add(run_time_analysis, skip=SKIP_TIME_ANALYSIS)
return container
def main() -> None:
pre_step_skipped: bool = False
# ** preprocess
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
preprocessed_data = run_preprocessing()
elif not SKIP_PREPROCESSING:
# !! hardcoded result filenames
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
verify_path(loading_path)
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
preprocessed_data = ret[0]
else:
pre_step_skipped = True
warnings.warn('No preprocessing action selected. Skipped.')
# sys.exit(0)
# ** token analysis
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
if pre_step_skipped:
raise RuntimeError(
'Preprocessing step skipped. Token analysis cannot be performed.'
)
preprocessed_data_trunc = typing.cast(
DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
) # type: ignore
tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
elif not SKIP_TOKEN_ANALYSIS:
# !! hardcoded result filenames
# whole graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
pre_step_skipped = True
warnings.warn('No token analysis action selected. Skipped.')
# ** graph postprocessing
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
if pre_step_skipped:
raise RuntimeError(
(
'Preprocessing or token analysis step skipped. '
'Graph postprocessing cannot be performed.'
)
)
tk_graph_filtered = run_graph_postprocessing(tk_graph)
elif not SKIP_GRAPH_POSTPROCESSING:
# !! hardcoded result filenames
# filtered graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
warnings.warn('No graph postprocessing action selected. Skipped.')
# ** time analysis
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
# no check for fails, runs separately
ret = run_time_analysis()
elif not SKIP_TIME_ANALYSIS:
...
else:
warnings.warn('No time analysis action selected. Skipped.')
procedure = build_pipeline_container()
procedure.run()
if __name__ == '__main__':
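How the new pipeline management is wired together, as a condensed sketch: the *_skip flags in the [control] section of the config are exposed as SKIP_* constants and handed to PipelineContainer.add(), which logs and skips those steps instead of executing them (values shown are the ones from the test config in this commit):

# config [control]                      constants.py                         __main__.py
# preprocessing_skip = true         ->  SKIP_PREPROCESSING = True        ->  container.add(run_preprocessing, skip=True)
# token_analysis_skip = true        ->  SKIP_TOKEN_ANALYSIS = True       ->  container.add(run_token_analysis, skip=True)
# graph_postprocessing_skip = true  ->  SKIP_GRAPH_POSTPROCESSING = True ->  container.add(run_graph_postprocessing, skip=True)
# time_analysis_skip = false        ->  SKIP_TIME_ANALYSIS = False       ->  container.add(run_time_analysis, skip=False)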


@@ -1,190 +0,0 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** graph config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()


@@ -1,9 +1,9 @@
import copy
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import copy
import dash_cytoscape as cyto
from dash import Dash, Input, Output, State, dcc, html
@@ -30,20 +30,20 @@ app = Dash(__name__, external_stylesheets=external_stylesheets)
cose_layout = {
'name': 'cose',
'nodeOverlap': 20,
'nodeOverlap': 500,
'refresh': 20,
'fit': True,
'padding': 30,
'randomize': True,
'componentSpacing': 40,
'nodeRepulsion': 2000,
'padding': 20,
'randomize': False,
'componentSpacing': 1.2,
'nodeRepulsion': 1000,
'edgeElasticity': 1000,
'idealEdgeLength': 100,
'nestingFactor': 1.2,
'gravity': 50,
'numIter': 2000,
'initialTemp': 1000,
'coolingFactor': 0.95,
'numIter': 3000,
'initialTemp': 2000,
'coolingFactor': 0.7,
'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True,
}
@@ -108,9 +108,8 @@ my_stylesheet = [
# {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]
app.layout = html.Div(
layout = html.Div(
[
html.Button('Trigger JS Layout', id='test_js'),
html.Button('Trigger JS Weight', id='test_js_weight'),
html.Div(id='output'),
html.Div(
@@ -166,11 +165,13 @@ app.layout = html.Div(
style={'width': '40%'},
),
html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div(
[
cyto.Cytoscape(
id='cytoscape-graph',
style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet,
elements=cyto_data_base,
zoom=1,
@@ -192,6 +193,9 @@ app.layout = html.Div(
)
app.layout = layout
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'),
@@ -266,17 +270,17 @@ app.clientside_callback(
"""
function(n_clicks, layout) {
layout.edgeElasticity = function(edge) {
return edge.data().weight * 4;
return edge.data().weight * 0.05;
};
layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.8;
return edge.data().weight * 0.4;
};
cy.layout(layout).run();
return layout;
}
""",
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('test_js', 'n_clicks'),
Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'),
prevent_initial_call=True,
)


@@ -1,368 +0,0 @@
import json
import os
import dash
import dash_cytoscape as cyto
from dash import Input, Output, State, callback, dcc, html
# Load extra layouts
cyto.load_extra_layouts()
# Display utility functions
def _merge(a, b):
return dict(a, **b)
def _omit(omitted_keys, d):
return {k: v for k, v in d.items() if k not in omitted_keys}
# Custom Display Components
def Card(children, **kwargs):
return html.Section(
children,
style=_merge(
{
'padding': 20,
'margin': 5,
'borderRadius': 5,
'border': 'thin lightgrey solid',
'background-color': 'white',
# Remove possibility to select the text for better UX
'user-select': 'none',
'-moz-user-select': 'none',
'-webkit-user-select': 'none',
'-ms-user-select': 'none',
},
kwargs.get('style', {}),
),
**_omit(['style'], kwargs),
)
def SectionTitle(title, size, align='center', color='#222'):
return html.Div(
style={'text-align': align, 'color': color},
children=dcc.Markdown('#' * size + ' ' + title),
)
def NamedCard(title, size, children, **kwargs):
size = min(size, 6)
size = max(size, 1)
return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])
def NamedSlider(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[
html.P(f'{name}:'),
html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
],
)
def NamedDropdown(name, **kwargs):
return html.Div(
style={'margin': '10px 0px'},
children=[
html.P(children=f'{name}:', style={'margin-left': '3px'}),
dcc.Dropdown(**kwargs),
],
)
def NamedRadioItems(name, **kwargs):
return html.Div(
style={'padding': '20px 10px 25px 4px'},
children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
)
def NamedInput(name, **kwargs):
return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])
# Utils
def DropdownOptionsList(*args):
return [{'label': val.capitalize(), 'value': val} for val in args]
asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')
app = dash.Dash(__name__, assets_folder=asset_path)
server = app.server
# ###################### DATA PREPROCESSING ######################
# Load data
with open('sample_network.txt', 'r', encoding='utf-8') as f:
network_data = f.read().split('\n')
# We select the first 750 edges and associated nodes for an easier visualization
edges = network_data[:750]
nodes = set()
following_node_di = {} # user id -> list of users they are following
following_edges_di = {} # user id -> list of cy edges starting from user id
followers_node_di = {} # user id -> list of followers (cy_node format)
followers_edges_di = {} # user id -> list of cy edges ending at user id
cy_edges = []
cy_nodes = []
for edge in edges:
if ' ' not in edge:
continue
source, target = edge.split(' ')
cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}
if source not in nodes:
nodes.add(source)
cy_nodes.append(cy_source)
if target not in nodes:
nodes.add(target)
cy_nodes.append(cy_target)
# Process dictionary of following
if not following_node_di.get(source):
following_node_di[source] = []
if not following_edges_di.get(source):
following_edges_di[source] = []
following_node_di[source].append(cy_target)
following_edges_di[source].append(cy_edge)
# Process dictionary of followers
if not followers_node_di.get(target):
followers_node_di[target] = []
if not followers_edges_di.get(target):
followers_edges_di[target] = []
followers_node_di[target].append(cy_source)
followers_edges_di[target].append(cy_edge)
genesis_node = cy_nodes[0]
genesis_node['classes'] = 'genesis'
default_elements = [genesis_node]
default_stylesheet = [
{'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
{
'selector': 'edge',
'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
},
{'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
{
'selector': '.followerEdge',
'style': {
'mid-target-arrow-color': 'blue',
'mid-target-arrow-shape': 'vee',
'line-color': '#0074D9',
},
},
{'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
{
'selector': '.followingEdge',
'style': {
'mid-target-arrow-color': 'red',
'mid-target-arrow-shape': 'vee',
'line-color': '#FF4136',
},
},
{
'selector': '.genesis',
'style': {
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'purple',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': '#B10DC9',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': ':selected',
'style': {
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'label': 'data(label)',
'color': 'black',
'font-size': 12,
'z-index': 9999,
},
},
]
# ################################# APP LAYOUT ################################
styles = {
'json-output': {
'overflow-y': 'scroll',
'height': 'calc(50% - 25px)',
'border': 'thin lightgrey solid',
},
'tab': {'height': 'calc(98vh - 80px)'},
}
app.layout = html.Div(
[
html.Div(
className='eight columns',
children=[
cyto.Cytoscape(
id='cytoscape',
elements=default_elements,
stylesheet=default_stylesheet,
style={'height': '95vh', 'width': '100%'},
)
],
),
html.Div(
className='four columns',
children=[
dcc.Tabs(
id='tabs',
children=[
dcc.Tab(
label='Control Panel',
children=[
NamedDropdown(
name='Layout',
id='dropdown-layout',
options=DropdownOptionsList(
'random',
'grid',
'circle',
'concentric',
'breadthfirst',
'cose',
'cose-bilkent',
'dagre',
'cola',
'klay',
'spread',
'euler',
),
value='grid',
clearable=False,
),
NamedRadioItems(
name='Expand',
id='radio-expand',
options=DropdownOptionsList('followers', 'following'),
value='followers',
),
],
),
dcc.Tab(
label='JSON',
children=[
html.Div(
style=styles['tab'],
children=[
html.P('Node Object JSON:'),
html.Pre(
id='tap-node-json-output',
style=styles['json-output'],
),
html.P('Edge Object JSON:'),
html.Pre(
id='tap-edge-json-output',
style=styles['json-output'],
),
],
)
],
),
],
),
],
),
]
)
# ############################## CALLBACKS ####################################
@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
def display_tap_node(data):
return json.dumps(data, indent=2)
@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
def display_tap_edge(data):
return json.dumps(data, indent=2)
@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
def update_cytoscape_layout(layout):
return {'name': layout}
@callback(
Output('cytoscape', 'elements'),
Input('cytoscape', 'tapNodeData'),
State('cytoscape', 'elements'),
State('radio-expand', 'value'),
)
def generate_elements(nodeData, elements, expansion_mode):
if not nodeData:
return default_elements
# If the node has already been expanded, we don't expand it again
if nodeData.get('expanded'):
return elements
# This retrieves the currently selected element, and tag it as expanded
for element in elements:
if nodeData['id'] == element.get('data').get('id'):
element['data']['expanded'] = True
break
if expansion_mode == 'followers':
followers_nodes = followers_node_di.get(nodeData['id'])
followers_edges = followers_edges_di.get(nodeData['id'])
if followers_nodes:
for node in followers_nodes:
node['classes'] = 'followerNode'
elements.extend(followers_nodes)
if followers_edges:
for follower_edge in followers_edges:
follower_edge['classes'] = 'followerEdge'
elements.extend(followers_edges)
elif expansion_mode == 'following':
following_nodes = following_node_di.get(nodeData['id'])
following_edges = following_edges_di.get(nodeData['id'])
if following_nodes:
for node in following_nodes:
if node['data']['id'] != genesis_node['data']['id']:
node['classes'] = 'followingNode'
elements.append(node)
if following_edges:
for follower_edge in following_edges:
follower_edge['classes'] = 'followingEdge'
elements.extend(following_edges)
return elements
if __name__ == '__main__':
app.run_server(debug=True)

File diff suppressed because it is too large.


@@ -0,0 +1,507 @@
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import dash_cytoscape as cyto
import pandas as pd
import plotly.express as px
from dash import (
Dash,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
from pandas import DataFrame
from lang_main.analysis import graphs
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from lang_main.analysis import tokens
from lang_main.constants import SPCY_MODEL
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
# p_df = Path(r'../Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
# p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** figure config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
# ** graphs
target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
p = Path(target).resolve()
ret = load_pickle(p)
tk_graph = cast(graphs.TokenGraph, ret[0])
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data_base, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
MIN_WEIGHT = weight_data['min']
MAX_WEIGHT = weight_data['max']
cyto.load_extra_layouts()
cose_layout = {
'name': 'cose',
'nodeOverlap': 500,
'refresh': 20,
'fit': True,
'padding': 20,
'randomize': False,
'componentSpacing': 1.2,
'nodeRepulsion': 1000,
'edgeElasticity': 1000,
'idealEdgeLength': 100,
'nestingFactor': 1.2,
'gravity': 50,
'numIter': 3000,
'initialTemp': 2000,
'coolingFactor': 0.7,
'minTemp': 1.0,
'nodeDimensionsIncludeLabels': True,
}
my_stylesheet = [
# Group selectors
{
'selector': 'node',
'style': {
'shape': 'circle',
'content': 'data(label)',
'background-color': '#B10DC9',
'border-width': 2,
'border-color': 'black',
'border-opacity': 1,
'opacity': 1,
'color': 'black',
'text-opacity': 1,
'font-size': 12,
'z-index': 9999,
},
},
{
'selector': 'edge',
'style': {
#'width': f'mapData(weight, {MIN_WEIGHT}, {MAX_WEIGHT}, 1, 10)',
# 'width': """function(ele) {
# return ele.data('weight');
# """,
'curve-style': 'bezier',
'line-color': 'grey',
'line-style': 'solid',
'line-opacity': 1,
},
},
# Class selectors
# {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
# {'selector': '.triangle', 'style': {'shape': 'triangle'}},
]
# ** app
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = Dash(__name__, external_stylesheets=external_stylesheets)
graph_layout = html.Div(
[
html.Button('Trigger JS Weight', id='test_js_weight'),
html.Button('Trigger Candidate Graph', id='cand_graph'),
html.Div(id='output'),
html.Div(
[
html.H2('Token Graph', style={'margin': 0}),
html.Button(
'Reset Default',
id='bt-reset',
style={
'marginLeft': 'auto',
'width': '300px',
},
),
],
style={
'display': 'flex',
'marginBottom': '1em',
},
),
html.H3('Layout'),
dcc.Dropdown(
id='layout_choice',
options=[
'cose',
'cola',
'euler',
'random',
],
value='cose',
clearable=False,
),
html.Div(
[
html.H3('Graph Filter'),
dcc.Input(
id='weight_min',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Minimum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
dcc.Input(
id='weight_max',
type='number',
min=MIN_WEIGHT,
max=MAX_WEIGHT,
step=1,
placeholder=f'Maximum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
debounce=True,
style={'width': '40%'},
),
html.H3('Graph'),
html.Button('Re-Layout', id='trigger_relayout'),
html.Div(
[
cyto.Cytoscape(
id='cytoscape-graph',
style={'width': '100%', 'height': '600px'},
layout=cose_layout,
stylesheet=my_stylesheet,
elements=cyto_data_base,
zoom=1,
),
],
style={
'border': '3px solid black',
'borderRadius': '25px',
'marginTop': '1em',
'marginBottom': '2em',
'padding': '7px',
},
),
],
style={'marginTop': '1em'},
),
],
)
app.layout = html.Div(
[
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(
[dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
),
graph_layout,
],
style={'margin': '2em'},
)
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index() # type: ignore
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
# obj_id = int(obj_id)
# # cands
# cands_obj_id = cands[obj_id]
# cands_choice = cands_obj_id[int(index) - 1]
# # data
# df = data.loc[list(cands_choice)].sort_index() # type: ignore
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
def pre_filter_data(
data: DataFrame,
idx: int,
obj_id: ObjectID,
) -> DataFrame:
obj_id = int(obj_id)
data = data.copy()
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(idx) - 1]
# data
data = data.loc[list(cands_choice)].sort_index() # type: ignore
return data
# ** graph
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'min', allow_duplicate=True),
Output('weight_min', 'max', allow_duplicate=True),
Output('weight_min', 'placeholder', allow_duplicate=True),
Output('weight_max', 'min', allow_duplicate=True),
Output('weight_max', 'max', allow_duplicate=True),
Output('weight_max', 'placeholder', allow_duplicate=True),
Input('cand_graph', 'n_clicks'),
State('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_graph_candidates(_, index, obj_id):
df = pre_filter_data(data, idx=index, obj_id=obj_id)
tk_graph_cands, _ = tokens.build_token_graph(
data=df,
model=SPCY_MODEL,
target_feature='VorgangsBeschreibung',
build_map=False,
)
cyto_data, weight_info = graphs.convert_graph_to_cytoscape(tk_graph_cands)
weight_min = weight_info['min']
weight_max = weight_info['max']
placeholder_min = f'Minimum edge weight: {weight_min} - {weight_max}'
placeholder_max = f'Maximum edge weight: {weight_min} - {weight_max}'
return (
cyto_data,
weight_min,
weight_max,
placeholder_min,
weight_min,
weight_max,
placeholder_max,
)
@app.callback(
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('layout_choice', 'value'),
prevent_initial_call=True,
)
def update_layout_internal(layout_choice):
# return {'name': layout_choice}
return cose_layout
# return cose_bilkent_layout
# return cola_layout
@app.callback(
Output('cytoscape-graph', 'zoom'),
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Output('weight_min', 'value'),
Output('weight_max', 'value'),
Input('bt-reset', 'n_clicks'),
prevent_initial_call=True,
)
def reset_layout(n_clicks):
return (1, cyto_data_base, None, None)
# update edge weight
@app.callback(
Output('cytoscape-graph', 'elements', allow_duplicate=True),
Input('weight_min', 'value'),
Input('weight_max', 'value'),
prevent_initial_call=True,
)
def update_edge_weight(weight_min, weight_max):
if not any([weight_min, weight_max]):
return cyto_data_base
if weight_min is None:
weight_min = MIN_WEIGHT
if weight_max is None:
weight_max = MAX_WEIGHT
tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, weight_min, weight_max)
# tk_graph_filtered = tk_graph.filter_by_edge_weight(weight_min, weight_max)
tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
cyto_data, _ = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
return cyto_data
app.clientside_callback(
"""
function(n_clicks, layout) {
layout.edgeElasticity = function(edge) {
return edge.data().weight * 0.05;
};
layout.idealEdgeLength = function(edge) {
return edge.data().weight * 0.4;
};
cy.layout(layout).run();
return layout;
}
""",
Output('cytoscape-graph', 'layout', allow_duplicate=True),
Input('trigger_relayout', 'n_clicks'),
State('cytoscape-graph', 'layout'),
prevent_initial_call=True,
)
app.clientside_callback(
"""
function(n_clicks, stylesheet) {
function edge_weight(ele) {
let threshold = 1000;
let weight = ele.data('weight');
if (weight > threshold) {
weight = 12;
} else {
weight = weight / threshold * 10;
weight = Math.max(1, weight);
}
return weight;
}
stylesheet[1].style.width = edge_weight;
cy.style(stylesheet).update();
return stylesheet;
}
""",
Output('cytoscape-graph', 'stylesheet'),
Input('test_js_weight', 'n_clicks'),
State('cytoscape-graph', 'stylesheet'),
prevent_initial_call=False,
)
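# The clientside callback above derives the rendered edge width from an edge's
# 'weight': weights above a threshold of 1000 are capped at width 12, smaller
# weights are scaled linearly (weight / 1000 * 10) and clamped to at least 1.
# A Python sketch of the same mapping, for reference only (the app does this in JS):
def _edge_width_sketch(weight: float, threshold: float = 1000.0) -> float:
    if weight > threshold:
        return 12.0
    return max(1.0, weight / threshold * 10.0)

# _edge_width_sketch(150) -> 1.5, _edge_width_sketch(50) -> 1.0, _edge_width_sketch(2500) -> 12.0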
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
address = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(address)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()


@@ -1,38 +0,0 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis]
threshold_unique_texts = 5


@@ -2,22 +2,20 @@
[paths]
inputs = './inputs/'
results = './results/test_20240529/'
results = './results/test_20240619/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# debugging switches only; production-ready pipelines should always
# be fully executed
[control]
preprocessing = false
preprocessing_skip = false
token_analysis = true
token_analysis_skip = false
graph_postprocessing = false
preprocessing_skip = true
token_analysis_skip = true
graph_postprocessing_skip = true
time_analysis = false
time_analysis_skip = true
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'


@@ -1,3 +1,5 @@
from __future__ import annotations
import copy
import sys
import typing
@@ -169,6 +171,90 @@ def convert_graph_to_cytoscape(
return cyto_data, weight_metadata
def filter_graph_by_edge_weight(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(graph.edges)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_graph_by_node_degree(
graph: TokenGraph,
bound_lower: int | None,
bound_upper: int | None,
) -> TokenGraph:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
TokenGraph
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(graph.nodes)
filtered_graph = graph.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
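# Usage sketch for the two filter helpers above, mirroring the dashboard code in
# this commit: keep only edges with weight >= 150, then drop nodes left isolated
# (degree < 1); passing None leaves the corresponding bound open.
#
#     tk_graph_filtered = filter_graph_by_edge_weight(tk_graph, 150, None)
#     tk_graph_filtered = filter_graph_by_node_degree(tk_graph_filtered, 1, None)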
# ** ---------------------------------------
class TokenGraph(DiGraph):
def __init__(
self,
@@ -286,87 +372,6 @@ class TokenGraph(DiGraph):
graph=self._undirected, logging=logging
)
def filter_by_edge_weight(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all edges which are within the provided bounds
Parameters
----------
bound_lower : int | None
lower bound for edge weights, edges with weight equal to this value are retained
bound_upper : int | None
upper bound for edge weights, edges with weight equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered edges
"""
original_graph_edges = copy.deepcopy(self.edges)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for edge in original_graph_edges:
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if bound_lower is not None and weight < bound_lower:
filtered_graph.remove_edge(edge[0], edge[1])
if bound_upper is not None and weight > bound_upper:
filtered_graph.remove_edge(edge[0], edge[1])
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def filter_by_node_degree(
self,
bound_lower: int | None,
bound_upper: int | None,
) -> Self:
"""filters all nodes which are within the provided bounds by their degree
Parameters
----------
bound_lower : int | None
lower bound for node degree, nodes with degree equal to this value are retained
bound_upper : int | None
upper bound for node degree, nodes with degree equal to this value are retained
Returns
-------
Self
a copy of the graph with filtered nodes
"""
# filter nodes by degree
original_graph_nodes = copy.deepcopy(self.nodes)
filtered_graph = self.copy()
if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.')
return filtered_graph
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper:
filtered_graph.remove_node(node)
if filtered_graph._undirected is not None:
filtered_graph.to_undirected(inplace=True, logging=False)
filtered_graph.update_metadata(logging=False)
return filtered_graph
def _save_prepare(
self,
path: Path,
@@ -379,14 +384,13 @@ class TokenGraph(DiGraph):
return saving_path
def save_graph(
def to_GraphML(
self,
path: Path,
filename: str | None = None,
directed: bool = False,
) -> None:
"""save one of the stored graphs to disk file,
currently only GraphML format is supported
"""save one of the stored graphs to GraphML format on disk,
Parameters
----------


@@ -22,7 +22,7 @@ from lang_main.analysis.shared import (
similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex
# ** RE patterns
@@ -119,10 +119,9 @@ def remove_duplicates(
).copy()
logger.info('Removed all duplicates from dataset successfully.')
logger.info(
(
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
f'number of features {len(wo_duplicates.columns)}'
)
'New Dataset properties: number of entries: %d, number of features %d',
len(wo_duplicates),
len(wo_duplicates.columns),
)
return (wo_duplicates,)
@@ -176,6 +175,7 @@ def clean_string_slim(string: str) -> str:
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
@@ -241,11 +241,84 @@ def analyse_feature(
return (result_df,)
# ** pre-filter
def numeric_pre_filter_feature(
data: DataFrame,
feature: str,
bound_lower: int | None,
bound_upper: int | None,
) -> tuple[DataFrame]:
if not any([bound_lower, bound_upper]):
raise ValueError('No bounds for filtering provided')
data = data.copy()
if bound_lower is None:
bound_lower = cast(int, data[feature].min())
if bound_upper is None:
bound_upper = cast(int, data[feature].max())
filter_lower = data[feature] >= bound_lower
filter_upper = data[feature] <= bound_upper
filter = filter_lower & filter_upper
data = data.loc[filter]
return (data,)
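# Usage sketch: the character-length filter that previously lived in the run script
# can be expressed with this helper. The bounds are inclusive, so the old check
# len > THRESHOLD_AMOUNT_CHARACTERS (threshold 5 in the config) becomes a lower
# bound of 6:
#
#     (subset_data,) = numeric_pre_filter_feature(data, feature='len', bound_lower=6, bound_upper=None)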
# ** embedding based similarity
# the following functions are used to identify similar entries so that duplicates
# can be detected more robustly, mitigating disturbances such as typos, escape
# characters, etc.
# build mapping of embeddings for given model
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data,)
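# Usage sketch; the model name is an illustrative assumption (any SentenceTransformer
# works) and cos_sim_threshold mirrors threshold_similarity = 0.8 from the config:
#
#     from sentence_transformers import SentenceTransformer
#     model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
#     (merged_data,) = merge_similarity_dupl(data, model=model, cos_sim_threshold=0.8)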
#####################################################################
def build_embedding_map(
data: Series,
model: GermanSpacyModel | SentenceTransformer,
@@ -373,7 +446,7 @@ def list_cosSim_dupl_candidates(
save_candidates: bool = False,
saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates',
pipeline: BasePipeline | None = None,
pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes
@@ -465,53 +538,6 @@ def similar_ids_groups(
"""
def merge_similarity_dupl(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data.copy(),)
# merge duplicates
def merge_similarity_dupl_old(
data: DataFrame,


@@ -24,13 +24,13 @@ PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
# if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
# DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
# DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
# DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
# DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models
@@ -66,11 +66,11 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple(
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][


@@ -1,4 +1,3 @@
import os
import pickle
import shutil
import tomllib
@@ -22,7 +21,7 @@ def create_saving_folder(
if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder)
saving_path_folder.mkdir(parents=True)
else:
logger.info(
(
@@ -62,56 +61,14 @@ def load_pickle(
return obj
# TODO: remove, too specialised for common application
"""
def filter_candidates_idx(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
common function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed of data as Series to retain information about indices of entries
Parameters
----------
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float
threshold for cosine similarity to filter candidates
Yields
------
Iterator[tuple[PandasIndex, PandasIndex]]
index pairs which meet the cosine similarity threshold
# embeddings
batch = typing.cast(list[str],
data_model_input.to_list())
embds = typing.cast(Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
))
# cosine similarity
cos_sim = typing.cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = typing.cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
def get_entry_point(
saving_path: Path,
filename: str,
) -> Path:
entry_point_path = (saving_path / filename).with_suffix('.pkl')
if not entry_point_path.exists():
raise FileNotFoundError(
f'Could not find provided entry data under path: >>{entry_point_path}<<'
)
yield idx_pair
"""
return entry_point_path
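A short usage sketch for the new helper, using the results folder and the TIMELINE pickle that appear elsewhere in this commit:

entry_point_path = get_entry_point(Path('./results/test_20240619'), 'TIMELINE')
# -> Path('results/test_20240619/TIMELINE.pkl'); raises FileNotFoundError if the pickle is missing
loaded_results = load_pickle(entry_point_path)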


@@ -9,14 +9,12 @@ dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# debugging switches only; production-ready pipelines should always
# be fully executed
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
@@ -42,9 +40,12 @@ criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'


@@ -1,9 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Callable
from pathlib import Path
from typing import Any
from typing import Any, Never, cast
from typing_extensions import override
from lang_main.loggers import logger_pipelines as logger
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_pipelines as logger
from lang_main.types import ResultHandling
# ** pipelines to perform given actions on dataset in a customisable manner
@@ -12,7 +17,18 @@ class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class BasePipeline:
class WrongActionTypeError(Exception):
"""Error raised if added action type is not supported by corresponding pipeline"""
class OutputInPipelineContainerError(Exception):
"""Error raised if an output was detected by one of the performed
actions in a PipelineContainer. Each action in a PipelineContainer is itself a
procedure which does not have any parameters or return values and should therefore not
return any values."""
class BasePipeline(ABC):
def __init__(
self,
name: str,
@@ -25,18 +41,12 @@ class BasePipeline:
self.name = name
# working directory for pipeline == output path
self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass
self.actions: list[Callable] = []
self.action_names: list[str] = []
self.actions_kwargs: list[dict[str, Any]] = []
self.is_save_result: list[bool] = []
# progress tracking, start at 1
self.curr_proc_idx: int = 1
# intermediate result
self._intermediate_result: Any | None = None
def __repr__(self) -> str:
return (
@@ -44,15 +54,132 @@ class BasePipeline:
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
@property
def intermediate_result(self) -> Any:
return self._intermediate_result
def panic_wrong_action_type(
self,
action: Any,
compatible_type: str,
) -> Never:
raise WrongActionTypeError(
(
f'Action must be of type {compatible_type}, '
f'but is of type >>{type(action)}<<.'
)
)
def prep_run(self) -> None:
logger.info('Starting pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
'The pipeline does not contain any performable actions.'
)
def post_run(self) -> None:
logger.info(
'Processing pipeline >>%s<< successfully ended after %d steps.',
self.name,
(self.curr_proc_idx - 1),
)
@abstractmethod
def add(self) -> None: ...
@abstractmethod
def logic(self) -> None: ...
def run(self, *args, **kwargs) -> Any:
self.prep_run()
ret = self.logic(*args, **kwargs)
self.post_run()
return ret
class PipelineContainer(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
super().__init__(name=name, working_dir=working_dir)
self.action_skip: list[bool] = []
@override
def add(
self,
action: Callable,
skip: bool = False,
) -> None:
if isinstance(action, Callable):
self.actions.append(action)
self.action_names.append(action.__name__)
self.action_skip.append(skip)
else:
self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
@override
def logic(self) -> None:
for idx, (action, action_name) in enumerate(zip(self.actions, self.action_names)):
# loading
if self.action_skip[idx]:
logger.info('[No Calculation] Skipping >>%s<<...', action_name)
self.curr_proc_idx += 1
continue
# calculation
ret = action()
if ret is not None:
raise OutputInPipelineContainerError(
(
f'Output in PipelineContainers not allowed. Action {action_name} '
f'returned values in Container {self.name}.'
)
)
# processing tracking
self.curr_proc_idx += 1
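
For orientation, a minimal usage sketch of the PipelineContainer API above (illustrative only; the stage procedures and the results folder are placeholders, not part of this commit): actions are plain zero-argument procedures, and the per-action skip flag lets a run bypass stages that have already been computed.

from pathlib import Path

from lang_main.pipelines.base import PipelineContainer


def prepare_dataset() -> None:
    # hypothetical stage procedure: no parameters, no return value
    ...


def analyse_tokens() -> None:
    # hypothetical stage procedure: no parameters, no return value
    ...


container = PipelineContainer(name='Main', working_dir=Path('./results'))
container.add(prepare_dataset, skip=True)  # logged as '[No Calculation] Skipping ...' and not executed
container.add(analyse_tokens)              # executed; returning anything raises OutputInPipelineContainerError
container.run()                            # prep_run -> logic -> post_run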
class Pipeline(BasePipeline):
def __init__(
self,
name: str,
working_dir: Path,
) -> None:
# init base class
super().__init__(name=name, working_dir=working_dir)
# name of pipeline
self.name = name
# working directory for pipeline == output path
self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass
self.actions_kwargs: list[dict[str, Any]] = []
self.save_results: ResultHandling = []
self.load_results: ResultHandling = []
# intermediate result
self._intermediate_result: tuple[Any, ...] | None = None
def __repr__(self) -> str:
return (
f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
# @property
# def intermediate_result(self) -> tuple[Any, ...] | None:
# return self._intermediate_result
@override
def add(
self,
action: Callable,
action_kwargs: dict[str, Any] = {},
save_result: bool = False,
load_result: bool = False,
filename: str | None = None,
) -> None:
# check explicitly for function type
# if isinstance(action, FunctionType):
@ -60,11 +187,10 @@ class BasePipeline:
self.actions.append(action)
self.action_names.append(action.__name__)
self.actions_kwargs.append(action_kwargs.copy())
self.is_save_result.append(save_result)
self.save_results.append((save_result, filename))
self.load_results.append((load_result, filename))
else:
raise TypeError(
f'Action must be custom function, but is of type >>{type(action)}<<.'
)
self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
# TODO: add multiple entries by utilising simple add method
"""
@ -88,57 +214,84 @@ class BasePipeline:
f"but is of type >>{type(action)}<<."))
"""
def save_curr_result(
def get_result_path(
self,
filename: str,
action_idx: int,
filename: str | None,
) -> tuple[Path, str]:
action_name = self.action_names[action_idx]
if filename is None:
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_{action_name}'
else:
target_filename = filename
target_path = self.working_dir.joinpath(target_filename).with_suffix('.pkl')
return target_path, action_name
def load_step(
self,
action_idx: int,
filename: str | None,
) -> tuple[Any, ...]:
target_path, action_name = self.get_result_path(action_idx, filename)
if not target_path.exists():
raise FileNotFoundError(
(
f'No intermediate results for action >>{action_name}<< '
f'under >>{target_path}<< found'
)
)
# results should be a tuple, but that is not guaranteed
result_loaded = cast(tuple[Any, ...], load_pickle(target_path))
if not isinstance(result_loaded, tuple):
raise TypeError(f'Loaded results must be tuple, not {type(result_loaded)}')
return result_loaded
def save_step(
self,
action_idx: int,
filename: str | None,
) -> None:
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
target_path = self.working_dir.joinpath(target_filename)
target_path = target_path.with_suffix('.pkl')
# target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
# target_path = self.working_dir.joinpath(target_filename)
# target_path = target_path.with_suffix('.pkl')
target_path, _ = self.get_result_path(action_idx, filename)
# saving file locally
save_pickle(obj=self._intermediate_result, path=target_path)
def load_intermediate_result(
self,
saving_path: str,
filename: str,
) -> tuple[Any, ...]:
target_path = Path(saving_path + filename).with_suffix('.pkl')
# loading DataFrame or Series from pickle
data = load_pickle(target_path)
return data
def prep_run(self) -> None:
logger.info('Starting processing pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
'The pipeline does not contain any performable actions.'
)
def run(
@override
def logic(
self,
starting_values: tuple[Any, ...],
) -> tuple[Any, ...]:
# prepare start
self.prep_run()
for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
# loading
if self.load_results[idx][0]:
filename = self.load_results[idx][1]
ret = self.load_step(action_idx=idx, filename=filename)
logger.info(
'[No Calculation] Loaded result for action >>%s<< successfully',
self.action_names[idx],
)
self.curr_proc_idx += 1
continue
# calculation
if idx == 0:
ret = action(*starting_values, **action_kwargs)
else:
ret = action(*ret, **action_kwargs)
if not isinstance(ret, tuple):
ret = (ret,)
ret = cast(tuple[Any, ...], ret)
# save intermediate result
self._intermediate_result = ret
# check if result should be saved locally
if self.is_save_result[idx]:
self.save_curr_result(filename=self.action_names[idx])
# saving result locally, always save last action
if self.save_results[idx][0] or idx == (len(self.actions) - 1):
filename = self.save_results[idx][1]
self.save_step(action_idx=idx, filename=filename)
# processing tracking
self.curr_proc_idx += 1
logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
return ret
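
To make the save/load mechanics above concrete, here is a small sketch of how a Pipeline chains actions and persists intermediate results (the step functions, folder, and filename are hypothetical; by default results land under the Pipe-{name}_Step-{idx}_{action}.pkl pattern from get_result_path, an explicit filename is used verbatim plus the .pkl suffix).

from pathlib import Path

from pandas import DataFrame

from lang_main.pipelines.base import Pipeline


def load_data(path: str) -> DataFrame:
    # hypothetical first action; ignores the path and returns a tiny frame
    return DataFrame({'text': ['pump defect', None]})


def clean_data(data: DataFrame, column: str) -> DataFrame:
    # hypothetical second action; receives the previous return value plus its kwargs
    return data.dropna(subset=[column])


working_dir = Path('./results')
working_dir.mkdir(parents=True, exist_ok=True)  # the pipeline itself does not create its working dir

pipe = Pipeline(name='Demo', working_dir=working_dir)
pipe.add(load_data)
pipe.add(clean_data, {'column': 'text'}, save_result=True, filename='demo_clean')

# the first action receives starting_values, later actions the previous return value(s);
# the second step is pickled to ./results/demo_clean.pkl and the last step is always saved
(cleaned,) = pipe.run(starting_values=('data/raw.csv',))

# on a later run the same step could be restored instead of recomputed:
# pipe.add(clean_data, {'column': 'text'}, load_result=True, filename='demo_clean')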

View File

@ -1,9 +1,11 @@
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import (
analyse_feature,
clean_string_slim,
entry_wise_cleansing,
load_raw_data,
merge_similarity_dupl,
numeric_pre_filter_feature,
remove_duplicates,
remove_NA,
)
@ -23,40 +25,50 @@ from lang_main.constants import (
SAVE_PATH_FOLDER,
SPCY_MODEL,
STFR_MODEL,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.pipelines.base import Pipeline
from lang_main.types import EntryPoints
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(
load_raw_data,
{
'date_cols': DATE_COLS,
},
)
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(
entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
save_result=True,
)
pipe_target_feat.add(
analyse_feature,
{
'target_feature': 'VorgangsBeschreibung',
},
save_result=True,
)
def build_base_target_feature_pipe() -> Pipeline:
pipe_target_feat = Pipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(
load_raw_data,
{
'date_cols': DATE_COLS,
},
)
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(
entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
save_result=True,
filename=EntryPoints.TIMELINE,
)
pipe_target_feat.add(
analyse_feature,
{
'target_feature': 'VorgangsBeschreibung',
},
save_result=True,
)
return pipe_target_feat
# output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs
@ -81,68 +93,114 @@ pipe_target_feat.add(
# save_result=True,
# )
# ** Merge duplicates
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
# pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
merge_similarity_dupl,
{
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
)
def build_merge_duplicates_pipe() -> Pipeline:
pipe_merge = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
# pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
numeric_pre_filter_feature,
{
'feature': 'len',
'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
'bound_upper': None,
},
)
pipe_merge.add(
merge_similarity_dupl,
{
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
filename=EntryPoints.TOKEN_ANALYSIS,
)
return pipe_merge
# ** token analysis
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(
build_token_graph,
{
'model': SPCY_MODEL,
'target_feature': 'entry',
'weights_feature': 'num_occur',
'batch_idx_feature': 'batched_idxs',
'build_map': True,
'batch_size_model': 50,
},
save_result=True,
)
def build_tk_graph_pipe() -> Pipeline:
pipe_token_analysis = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(
build_token_graph,
{
'model': SPCY_MODEL,
'target_feature': 'entry',
'weights_feature': 'num_occur',
'batch_idx_feature': 'batched_idxs',
'build_map': False,
'batch_size_model': 50,
},
save_result=True,
filename=EntryPoints.TK_GRAPH_POST,
)
return pipe_token_analysis
def build_tk_graph_post_pipe() -> Pipeline:
pipe_graph_postprocessing = Pipeline(
name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_edge_weight,
{
'bound_lower': THRESHOLD_EDGE_WEIGHT,
'bound_upper': None,
},
)
pipe_graph_postprocessing.add(
graphs.filter_graph_by_node_degree,
{
'bound_lower': 1,
'bound_upper': None,
},
save_result=True,
filename=EntryPoints.TK_GRAPH_ANALYSIS,
)
return pipe_graph_postprocessing
# ** timeline analysis
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
},
save_result=True,
)
pipe_timeline.add(
generate_model_input,
{
'target_feature_name': 'nlp_model_input',
'model_input_features': MODEL_INPUT_FEATURES,
},
)
pipe_timeline.add(
filter_activities_per_obj_id,
{
'activity_feature': ACTIVITY_FEATURE,
'relevant_activity_types': ACTIVITY_TYPES,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
},
)
pipe_timeline.add(
get_timeline_candidates,
{
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',
},
save_result=True,
)
def build_timeline_pipe() -> Pipeline:
pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
},
save_result=True,
)
pipe_timeline.add(
generate_model_input,
{
'target_feature_name': 'nlp_model_input',
'model_input_features': MODEL_INPUT_FEATURES,
},
)
pipe_timeline.add(
filter_activities_per_obj_id,
{
'activity_feature': ACTIVITY_FEATURE,
'relevant_activity_types': ACTIVITY_TYPES,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
},
)
pipe_timeline.add(
get_timeline_candidates,
{
'model': STFR_MODEL,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',
},
save_result=True,
filename=EntryPoints.TIMELINE_POST,
)
return pipe_timeline
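
The EntryPoints filenames passed to save_result above are what make individual stages resumable: a later run can restore the pickled result of an expensive step instead of recomputing it, because load_step resolves the same path as save_step whenever the filename matches. A hedged sketch of that pattern (merge_step and deduplicated_frame are placeholders for the real merge action and its input):

from lang_main.constants import SAVE_PATH_FOLDER
from lang_main.pipelines.base import Pipeline
from lang_main.types import EntryPoints


def merge_step(data):
    # hypothetical stand-in for the expensive merge_similarity_dupl action
    return data


# first run: compute the result and persist it under the TOKEN_ANALYSIS entry point
pipe = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe.add(merge_step, save_result=True, filename=EntryPoints.TOKEN_ANALYSIS)
# (merged,) = pipe.run(starting_values=(deduplicated_frame,))          # heavy computation

# later run: restore TOKEN_ANALYSIS.pkl from the working directory instead of recomputing;
# load_result short-circuits the step, so the action itself is never called
pipe_resume = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_resume.add(merge_step, load_result=True, filename=EntryPoints.TOKEN_ANALYSIS)
# (merged,) = pipe_resume.run(starting_values=(deduplicated_frame,))   # '[No Calculation] Loaded result ...'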

View File

@ -17,8 +17,20 @@ class LoggingLevels(enum.IntEnum):
# ** devices
class STFRDeviceTypes(enum.StrEnum):
CPU = 'cpu'
GPU = 'cuda'
CPU = enum.auto()
GPU = enum.auto()
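
A quick aside on the StrEnum semantics used here (illustrative; assumes Python 3.11+, where enum.StrEnum is available): auto() derives each value from the lowercased member name.

import enum


class STFRDeviceTypes(enum.StrEnum):
    CPU = enum.auto()
    GPU = enum.auto()


assert STFRDeviceTypes.CPU == 'cpu'  # auto() lowercases the member name
assert STFRDeviceTypes.GPU == 'gpu'  # auto() yields 'gpu' here, not the previous literal 'cuda'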
# ** pipelines
ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
TIMELINE = 'TIMELINE'
TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
TOKEN_ANALYSIS = 'TOKEN_ANALYSIS'
# ** datasets

1687
test-notebooks/misc.ipynb Normal file

File diff suppressed because it is too large