sandboxing
This commit is contained in:
parent 9197146d2c
commit 3f58a14852
@@ -1,9 +1,11 @@
import time
import webbrowser
from pathlib import Path
from collections.abc import Collection, Iterable
from threading import Thread
from typing import Any, Final, cast

import pandas as pd

# import dash_cytoscape as cyto
import plotly.express as px
from dash import (
@@ -21,20 +23,37 @@ from plotly.graph_objects import Figure

import lang_main.io
from lang_main.analysis import graphs, tokens
from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
from lang_main.analysis.timeline import (
    calc_delta_to_next_failure,
    filter_timeline_cands,
)
from lang_main.constants import (
    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
    NAME_DELTA_FEAT_TO_REPAIR,
    SAVE_PATH_FOLDER,
    SPCY_MODEL,
)
from lang_main.errors import EmptyEdgesError, EmptyGraphError
from lang_main.pipelines.predefined import (
    build_tk_graph_render_pipe,
    build_tk_graph_rescaling_pipe,
)
from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
from lang_main.types import (
    DataFrameTLFiltered,
    EntryPoints,
    HTMLColumns,
    HTMLTable,
    ObjectID,
    TimelineCandidates,
)

# ** data
# p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
(data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
# p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
cands, texts = cast(
    tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
)
@@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
    file_ext='.svg',
)


TABLE_FEATS: Final[list[str]] = [
# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
TABLE_FEATS_OVERVIEW: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
    NAME_DELTA_FEAT_TO_REPAIR,
    'VorgangsTypName',
    'VorgangsBeschreibung',
    'ErledigungsBeschreibung',
]
TABLE_FEATS_DATES: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
]
TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
    'VorgangsTypName',
    'VorgangsBeschreibung',
    'ErledigungsBeschreibung',
    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
]

# ** figure config
MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
    'ErstellungsDatum': '|%d.%m.%Y',
    'ErledigungsDatum': '|%d.%m.%Y',
    'VorgangsBeschreibung': True,
    'ErledigungsBeschreibung': True,
}
HOVER_DATA_DELTA: Final[dict[str, Any]] = {
    'ErstellungsDatum': '|%d.%m.%Y',
    'ErledigungsDatum': '|%d.%m.%Y',
    'VorgangsDatum': '|%d.%m.%Y',
    'delta': True,
    NAME_DELTA_FEAT_TO_REPAIR: True,
    'VorgangsBeschreibung': True,
    'ErledigungsBeschreibung': True,
}

# ** graph
@@ -136,10 +167,10 @@ graph_layout = html.Div(
        html.Img(
            id='static-graph-img',
            alt='static rendered graph',
            # style={
            #     'width': 'auto',
            #     'height': 'auto',
            # },
            style={
                'width': 'auto',
                'height': 'auto',
            },
        ),
        html.P(id='info-graph-errors', children=[]),
    ],
@@ -186,7 +217,27 @@ app.layout = html.Div(
            ]
        ),
        html.Div(
            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
            children=[
                html.Div(
                    [
                        html.H5('Überblick ähnlicher Vorgänge'),
                        dash_table.DataTable(id='table-candidates'),
                    ],
                    style={'paddingBottom': '1em'},
                ),
                html.Div(
                    [
                        html.H5(
                            (
                                'Maßnahmen sortiert nach längstem Zeitraum '
                                'bis zum nächsten Ereignis'
                            )
                        ),
                        dash_table.DataTable(id='table-best-actions'),
                    ]
                ),
            ],
            style={'marginBottom': '2em', 'padding': '2em'},
        ),
        graph_layout,
    ],
@@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):


# ** helpers to filter DataFrame
def pre_filter_data(
def filter_candidates(
    data: DataFrame,
    idx: int,
    obj_id: ObjectID,
) -> DataFrame:
) -> DataFrameTLFiltered:
    # assert correct data type because of Dash
    idx = int(idx)
    obj_id = int(obj_id)
    # data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[int(idx) - 1]
    # data
    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
    data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
    data['delta'] = data['delta'].dt.days

    data = filter_timeline_cands(
        data=data,
        cands=cands,
        obj_id=obj_id,
        entry_idx=(idx - 1),  # idx in Dashboard starts with 1
    )

    return data

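A small sketch (toy candidate mapping, not from the data) of the index convention handled above: the dashboard selector is 1-based, while the candidate tuples and filter_timeline_cands are 0-based.

toy_cands = {42: ((3, 7, 9), (12, 15))}  # two candidate groups for ObjectID 42
selector_value = 2                       # as delivered by the Dash dropdown
group = toy_cands[42][selector_value - 1]
print(group)  # -> (12, 15); the same shift happens via entry_idx=(idx - 1) above
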
@@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
    obj_text = texts[obj_id]
    title_occurrences = f'HObjektText: {obj_text}'
    title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = filter_candidates(data, idx=index, obj_id=obj_id)
    # figure
    fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
    fig_delta = fig_timeline_delta(df, title_delta)
    fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)

    return fig_occurrences, fig_delta

@@ -293,11 +345,12 @@ def fig_timeline_occurrences(
def fig_timeline_delta(
    df: DataFrame,
    title: str,
    delta_feature: str,
) -> Figure:
    fig = px.scatter(
        data_frame=df,
        x='ErstellungsDatum',
        y='delta',
        y=delta_feature,
        title=title,
        hover_data=HOVER_DATA_DELTA,
    )
@@ -309,25 +362,77 @@ def fig_timeline_delta(
    return fig


def transform_to_HTML_table(
    data: DataFrame,
    target_features: Collection[str],
    date_cols: Iterable[str] | None = None,
    sorting_feature: str | None = None,
    sorting_ascending: bool = True,
) -> tuple[HTMLColumns, HTMLTable]:
    target_features = list(target_features)
    data = data.copy()
    data = data.filter(items=target_features, axis=1)

    if sorting_feature is not None:
        data = data.sort_values(by=sorting_feature, ascending=sorting_ascending)

    if date_cols is not None:
        for col in date_cols:
            data[col] = data[col].dt.strftime(r'%Y-%m-%d')

    columns = [{'name': col, 'id': col} for col in data.columns]
    table_data = data.to_dict('records')

    return columns, table_data


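A minimal, self-contained usage sketch of the helper above feeding a Dash DataTable; the toy DataFrame and its values are illustrative only.

import pandas as pd
from dash import dash_table

toy = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-02-01', '2024-01-15']),
        'ErledigungsDatum': pd.to_datetime(['2024-02-03', '2024-01-20']),
        'VorgangsBeschreibung': ['pump inspected', 'filter replaced'],
    }
)
cols, rows = transform_to_HTML_table(
    toy,
    target_features=['ErstellungsDatum', 'ErledigungsDatum', 'VorgangsBeschreibung'],
    date_cols=['ErstellungsDatum', 'ErledigungsDatum'],
    sorting_feature='ErstellungsDatum',
)
# `cols` -> [{'name': ..., 'id': ...}, ...] and `rows` -> list of records,
# exactly the shapes expected by dash_table.DataTable(columns=cols, data=rows)
table = dash_table.DataTable(columns=cols, data=rows)
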
# 'table-best-actions'
# ** HTML table
@callback(
    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
    [
        Output('table-candidates', 'columns'),
        Output('table-candidates', 'data'),
        Output('table-best-actions', 'columns'),
        Output('table-best-actions', 'data'),
    ],
    Input('selector-candidates', 'value'),
    State('selector-obj_id', 'value'),
    prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
        by='ErstellungsDatum', ascending=True
def update_tables_candidates(
    index,
    obj_id,
) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
    cands = filter_candidates(data, idx=index, obj_id=obj_id)
    overview_cols, overview_table = transform_to_HTML_table(
        data=cands,
        target_features=TABLE_FEATS_OVERVIEW,
        date_cols=TABLE_FEATS_DATES,
        sorting_feature='ErstellungsDatum',
        sorting_ascending=True,
    )
    cols = [{'name': i, 'id': i} for i in df.columns]
    # convert dates to strings
    for col in TABLE_FEATS_DATES:
        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
    # df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
    #     by='ErstellungsDatum', ascending=True
    # )
    # cols = [{'name': i, 'id': i} for i in df.columns]
    # # convert dates to strings
    # for col in TABLE_FEATS_DATES:
    #     df[col] = df[col].dt.strftime(r'%Y-%m-%d')

    table_data = df.to_dict('records')
    return table_data, cols
    # table_data = df.to_dict('records')

    cands_best_actions = calc_delta_to_next_failure(
        data=cands,
        date_feature='ErstellungsDatum',
        name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
    )
    best_actions_cols, best_actions_table = transform_to_HTML_table(
        data=cands_best_actions,
        target_features=TABLE_FEATS_BEST_ACTIONS,
        date_cols=TABLE_FEATS_DATES,
    )

    return overview_cols, overview_table, best_actions_cols, best_actions_table


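A minimal, self-contained sketch (hypothetical component ids) of the positional mapping Dash uses for multi-output callbacks, which the reworked callback above relies on when returning columns and data for both tables.

from dash import Dash, Input, Output, dash_table, dcc, html

demo = Dash(__name__)
demo.layout = html.Div([dcc.Input(id='demo-in'), dash_table.DataTable(id='demo-table')])


@demo.callback(
    [Output('demo-table', 'columns'), Output('demo-table', 'data')],
    Input('demo-in', 'value'),
    prevent_initial_call=True,
)
def fill_demo_table(_value):
    cols = [{'name': 'a', 'id': 'a'}]
    rows = [{'a': 1}, {'a': 2}]
    # the returned tuple maps onto the Output list by position: columns first, then data
    return cols, rows
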
# ** graph callbacks
@@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
def display_candidates_as_graph(index, obj_id):
    error_msg = ''
    t1 = time.perf_counter()
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = filter_candidates(data, idx=index, obj_id=obj_id)
    t2 = time.perf_counter()
    print(f'Time for filtering: {t2 - t1} s')


@@ -1,9 +1,8 @@
import re
from collections.abc import Iterable
from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import Callable, cast
from typing import cast

import numpy as np
import pandas as pd
@@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex

# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')


# ** (1) dataset preparation: loading and simple preprocessing
@@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
# duplicate cleansing based on all properties
def load_raw_data(
    path: Path,
    date_cols: Iterable[str] = (
    date_cols: Collection[str] = (
        'VorgangsDatum',
        'ErledigungsDatum',
        'Arbeitsbeginn',
@@ -50,7 +50,7 @@ def load_raw_data(
    ----------
    path : str
        path to dataset file, usually CSV file
    date_cols : list[str], optional
    date_cols : Collection[str], optional
        columns which contain dates and are parsed as such,
        by default (
        'VorgangsDatum',
@@ -129,9 +129,7 @@ def remove_duplicates(

def remove_NA(
    data: DataFrame,
    target_features: list[str] = [
        'VorgangsBeschreibung',
    ],
    target_features: Collection[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
    """function to drop NA entries based on a subset of features to be analysed

@@ -139,14 +137,15 @@ def remove_NA(
    ----------
    data : DataFrame
        standard IHM dataset, perhaps pre-cleaned
    target_features : list[str], optional
        subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ]
    target_features : Collection[str], optional
        subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)

    Returns
    -------
    DataFrame
        dataset with removed NA entries for given subset of features
    """
    target_features = list(target_features)
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@@ -156,46 +155,7 @@ def remove_NA(


# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # remove special chars
    string = pattern_special_chars.sub(' ', string)
    string = pattern_repeated_chars.sub(r'\1', string)
    # string = pattern_dates.sub('', string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(' ', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


def entry_wise_cleansing(
    data: DataFrame,
    target_feature: str,
    cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    data[target_feature] = data[target_feature].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
        cleansing_func.__name__,
        target_feature,
    )
    return (data,)
# ** moved to module ``lang_main.analysis.shared``


# ** in-depth analysis of one feature

@@ -1,4 +1,5 @@
from collections.abc import Iterable, Iterator
import re
from collections.abc import Callable, Collection, Iterable, Iterator
from typing import cast

import networkx as nx
@@ -7,14 +8,70 @@ import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from torch import Tensor

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.loggers import logger_preprocess as logger
from lang_main.types import PandasIndex

# ** RE patterns
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')


# ** RE applications
# following functions clean and prepare specific entries, not whole datasets
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # remove special chars
    string = pattern_escape_newline.sub('. ', string)
    string = pattern_escape_seq.sub(' ', string)
    string = pattern_repeated_chars.sub('', string)
    # string = pattern_dates.sub('', string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(' ', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


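A small usage sketch (illustrative input only) of the rewritten clean_string_slim: newlines become sentence breaks, other escape sequences and repeated punctuation are collapsed, and surrounding whitespace is stripped.

from lang_main.analysis.shared import clean_string_slim

raw = 'Pumpe defekt!!!\nAustausch erfolgt..\t'
print(clean_string_slim(raw))
# expected along the lines of: 'Pumpe defekt. Austausch erfolgt.'
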
# ** dataset cleansing
def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    target_features = list(target_features)
    data[target_features] = data[target_features].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
        cleansing_func.__name__,
        target_features,
    )
    return (data,)


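A hedged usage sketch of the generalised entry_wise_cleansing (toy DataFrame; column names taken from the dataset used elsewhere in this commit): it now cleans several features in one pass and returns a one-element tuple, matching the pipeline step convention.

import pandas as pd

from lang_main.analysis.shared import entry_wise_cleansing

frame = pd.DataFrame(
    {
        'VorgangsBeschreibung': ['Motor läuft nicht an!!!\nPrüfung nötig'],
        'ErledigungsBeschreibung': ['Sicherung getauscht..'],
    }
)
# default cleansing_func is clean_string_slim; both columns are cleaned element-wise
(cleaned,) = entry_wise_cleansing(
    frame,
    target_features=('VorgangsBeschreibung', 'ErledigungsBeschreibung'),
)
print(cleaned.iloc[0].to_dict())
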
# ** similarities
def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,

@@ -1,4 +1,4 @@
from collections.abc import Iterable, Iterator
from collections.abc import Collection, Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
@@ -7,14 +7,21 @@ from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    entry_wise_cleansing,
    pattern_escape_seq_sentences,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
from lang_main.types import (
    DataFrameTLFiltered,
    ObjectID,
    PandasIndex,
    TimelineCandidates,
)


def non_relevant_obj_ids(
def _non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
@@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset')
    logger.info('Removing non-relevant ObjectIDs from dataset...')
    data = data.copy()
    ids_to_ignore = non_relevant_obj_ids(
    ids_to_ignore = _non_relevant_obj_ids(
        data=data,
        thresh_unique_feat_per_id=thresh_unique_feat_per_id,
        feature_uniqueness=feature_uniqueness,
@@ -61,7 +68,43 @@ def remove_non_relevant_obj_ids(
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
    logger.info('Non-relevant ObjectIDs removed successfully')
    logger.info('Non-relevant ObjectIDs removed successfully.')

    return (data,)


def cleanup_descriptions(
    data: DataFrame,
    properties: Collection[str] = (
        'VorgangsBeschreibung',
        'ErledigungsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Cleaning necessary descriptions...')
    data = data.copy()
    features = list(properties)
    data[features] = data[features].fillna('N.V.')
    (data,) = entry_wise_cleansing(data, target_features=features)
    logger.info('Cleansing successful.')

    return (data.copy(),)


def calc_delta_to_repair(
    data: DataFrame,
    date_feature_start: str = 'ErstellungsDatum',
    date_feature_end: str = 'ErledigungsDatum',
    name_delta_feature: str = 'delta_to_repair',
    convert_to_days: bool = True,
) -> tuple[DataFrame]:
    logger.info('Calculating time differences between start and end of operations...')
    data = data.copy()
    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    logger.info('Calculation successful.')

    return (data,)

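A small worked sketch (toy dates, not from the dataset) of calc_delta_to_repair: the delta column is the end date minus the start date, converted to whole days by default.

import pandas as pd

from lang_main.analysis.timeline import calc_delta_to_repair

toy = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-03-10']),
        'ErledigungsDatum': pd.to_datetime(['2024-01-04', '2024-03-10']),
    }
)
(with_delta,) = calc_delta_to_repair(toy, name_delta_feature='delta_to_repair')
print(with_delta['delta_to_repair'].tolist())  # -> [3, 0]
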
@@ -75,7 +118,7 @@ def generate_model_input(
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features')
    logger.info('Generating concatenation of model input features...')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
@@ -83,7 +126,7 @@
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully')
    logger.info('Model input generated successfully.')

    return (data,)

@@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID')
    logger.info('Filtering activities per ObjectID...')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
@@ -113,7 +156,7 @@ def filter_activities_per_obj_id(

    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully')
    logger.info('Activities per ObjectID filtered successfully.')

    return data_filter_activities, num_activities_per_obj_id

@@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
## use idx pairs to get idx values of series


def get_timeline_candidates_index(
def _get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
@@ -161,7 +204,7 @@ def get_timeline_candidates_index(


# TODO: check application for duplicate removal
def transform_timeline_candidates(
def _transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """function to build a mapping of ObjectIDs to their respective collection of
@@ -200,7 +243,7 @@ def transform_timeline_candidates(
    return candidates_by_obj_id


def map_obj_id_to_texts(
def _map_obj_id_to_texts(
    data: DataFrame,
    feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
@@ -229,7 +272,7 @@ def get_timeline_candidates(
    model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    logger.info('Obtaining timeline candidates...')
    candidates = get_timeline_candidates_index(
    candidates = _get_timeline_candidates_index(
        data=data,
        num_activities_per_obj_id=num_activities_per_obj_id,
        model=model,
@@ -237,14 +280,52 @@ def get_timeline_candidates(
        feature_obj_id=feature_obj_id,
        model_input_feature=model_input_feature,
    )
    tl_candidates = transform_timeline_candidates(candidates)
    tl_candidates = _transform_timeline_candidates(candidates)
    logger.info('Timeline candidates obtained successfully.')
    # text mapping to obtain object descriptors
    logger.info('Mapping ObjectIDs to their respective text descriptor...')
    map_obj_text = map_obj_id_to_texts(
    map_obj_text = _map_obj_id_to_texts(
        data=data,
        feature_obj_id=feature_obj_id,
    )
    logger.info('ObjectIDs successfully mapped to text descriptors.')

    return tl_candidates, map_obj_text


# ** Postprocessing
# filter original dataset for a batch of timeline candidates
def filter_timeline_cands(
    data: DataFrame,
    cands: TimelineCandidates,
    obj_id: ObjectID,
    entry_idx: int,
    sort_feature: str = 'ErstellungsDatum',
) -> DataFrameTLFiltered:
    data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[entry_idx]
    data = data.loc[list(cands_choice)].sort_values(
        by=sort_feature,
        ascending=True,
    )

    return data


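A hedged sketch (toy indices only) of the data shapes involved: TimelineCandidates maps each ObjectID to several groups of row indices, and filter_timeline_cands picks one group and returns the matching rows sorted by the date feature.

import pandas as pd

from lang_main.analysis.timeline import filter_timeline_cands

toy = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-03-01', '2024-01-01', '2024-02-01'])},
    index=[10, 11, 12],
)
toy_cands = {7: ((11, 10), (12,))}  # ObjectID 7 has two candidate groups
subset = filter_timeline_cands(toy, cands=toy_cands, obj_id=7, entry_idx=0)
print(subset.index.tolist())  # rows ordered by 'ErstellungsDatum' -> [11, 10]
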
def calc_delta_to_next_failure(
    data: DataFrameTLFiltered,
    date_feature: str = 'ErstellungsDatum',
    name_delta_feature: str = 'delta_to_next_failure',
    convert_to_days: bool = True,
) -> DataFrameTLFiltered:
    data = data.copy()
    last_val = data[date_feature].iat[-1]
    shifted = data[date_feature].shift(-1, fill_value=last_val)
    data[name_delta_feature] = shifted - data[date_feature]
    data = data.sort_values(by=name_delta_feature, ascending=False)

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    return data
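A worked toy example of the shift(-1) logic above: each event's delta is the time until the next event, the last event gets 0, and rows are ordered by the longest gap first.

import pandas as pd

from lang_main.analysis.timeline import calc_delta_to_next_failure

events = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-03', '2024-01-10'])}
)
ranked = calc_delta_to_next_failure(events)
print(ranked['delta_to_next_failure'].tolist())  # -> [7, 2, 0]
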
@@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
    'isDeterministic': True,
    'singlePartition': False,
}
CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
# name for property, on which selection is done
CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2

# ** time analysis.uniqueness
# ** time_analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
    'threshold_unique_texts'
]
@@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.preparation
NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
    CONFIG['time_analysis']['model_input']['input_features']
Binary file not shown.
@@ -3,15 +3,19 @@ from pathlib import Path
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import (
    analyse_feature,
    clean_string_slim,
    entry_wise_cleansing,
    load_raw_data,
    merge_similarity_dupl,
    numeric_pre_filter_feature,
    remove_duplicates,
    remove_NA,
)
from lang_main.analysis.shared import (
    clean_string_slim,
    entry_wise_cleansing,
)
from lang_main.analysis.timeline import (
    calc_delta_to_repair,
    cleanup_descriptions,
    filter_activities_per_obj_id,
    generate_model_input,
    get_timeline_candidates,
@@ -25,6 +29,7 @@ from lang_main.constants import (
    DATE_COLS,
    FEATURE_NAME_OBJ_ID,
    MODEL_INPUT_FEATURES,
    NAME_DELTA_FEAT_TO_REPAIR,
    SAVE_PATH_FOLDER,
    SPCY_MODEL,
    STFR_MODEL,
@@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
    pipe_target_feat.add(
        entry_wise_cleansing,
        {
            'target_feature': 'VorgangsBeschreibung',
            'target_features': ('VorgangsBeschreibung',),
            'cleansing_func': clean_string_slim,
        },
        save_result=True,
@@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
        graphs.pipe_add_graph_metrics,
        save_result=save_result,
        filename=exit_point,
        # filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
    )

    return pipe_graph_rescaling
@@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
# ** timeline analysis
def build_timeline_pipe() -> Pipeline:
    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
    pipe_timeline.add(
        cleanup_descriptions,
        {
            'properties': ['ErledigungsBeschreibung'],
        },
    )
    pipe_timeline.add(
        calc_delta_to_repair,
        {
            'date_feature_start': 'ErstellungsDatum',
            'date_feature_end': 'ErledigungsDatum',
            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
            'convert_to_days': True,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
    )
    pipe_timeline.add(
        remove_non_relevant_obj_ids,
        {
@@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
            'model_input_feature': 'nlp_model_input',
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
        filename=EntryPoints.TIMELINE_CANDS,
    )

    return pipe_timeline

@@ -16,6 +16,7 @@ from lang_main.constants import (
    CYTO_LAYOUT_PROPERTIES,
    CYTO_NUMBER_SUBGRAPHS,
    CYTO_PATH_STYLESHEET,
    CYTO_SANDBOX_NAME,
    CYTO_SELECTION_PROPERTY,
    CYTO_STYLESHEET_NAME,
    PROPERTY_NAME_DEGREE_WEIGHTED,
@@ -56,6 +57,8 @@ def verify_connection():
def import_to_cytoscape(
    graph: DiGraph | Graph,
    network_name: str = CYTO_BASE_NETWORK_NAME,
    sandbox_name: str = CYTO_SANDBOX_NAME,
    reinitialise_sandbox: bool = True,
) -> None:
    """Cytoscape: import NetworkX graph as new network collection

@@ -66,6 +69,12 @@
    """
    logger.debug('Checking Cytoscape connection...')
    verify_connection()
    logger.debug('Setting Cytoscape sandbox...')
    p4c.sandbox_set(
        sandbox_name=sandbox_name,
        reinitialize=reinitialise_sandbox,
        copy_samples=False,
    )
    logger.debug('Importing to and analysing network in Cytoscape...')
    p4c.delete_all_networks()
    p4c.create_network_from_networkx(
@@ -122,6 +131,7 @@ def export_network_to_image(
    filetype: CytoExportFileTypes = 'SVG',
    network_name: str = CYTO_BASE_NETWORK_NAME,
    pdf_export_page_size: CytoExportPageSizes = 'A4',
    sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
    """Cytoscape: export current selected view as image

@@ -140,14 +150,17 @@
    logger.debug('Exporting image to file...')
    if not target_folder.exists():
        target_folder.mkdir(parents=True)
    file_pth = target_folder / filename
    dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')

    text_as_font = True
    if filetype == 'SVG':
        text_as_font = False

    # image is generated in sandbox directory and transferred to target destination
    # (preparation for remote instances of Cytoscape)
    # TODO close non-necessary windows before image display
    p4c.export_image(
        filename=str(file_pth),
        filename=filename,
        type=filetype,
        network=network_name,
        overwrite_file=True,
@@ -155,7 +168,24 @@
        export_text_as_font=text_as_font,
        page_size=pdf_export_page_size,
    )
    logger.debug('Exporting image to file successful.')
    # TODO change back to Cytoscape 3.10 and above
    # TODO remove if Cytoscape >= 3.10.* is running in container
    # p4c.export_image(
    #     filename=filename,
    #     type=filetype,
    #     network=network_name,
    #     overwrite_file=True,
    # )
    logger.debug('Exported image to sandbox.')
    logger.debug('Transferring image from sandbox to target destination...')
    sandbox_filename = f'{filename}.{filetype.lower()}'
    p4c.sandbox_get_from(
        source_file=sandbox_filename,
        dest_file=str(dst_file_pth),
        overwrite=True,
        sandbox_name=sandbox_name,
    )
    logger.debug('Transfer of image from sandbox to target destination successful.')


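For orientation, a condensed and hedged sketch of the py4cytoscape sandbox round-trip this commit introduces (the sandbox and file names are illustrative): files written by Cytoscape land in the sandbox and are transferred explicitly, which also works against a remote Cytoscape instance.

import py4cytoscape as p4c

SANDBOX = 'lang_main'  # same role as CYTO_SANDBOX_NAME
p4c.sandbox_set(sandbox_name=SANDBOX, reinitialize=True, copy_samples=False)

# push a local file (e.g. a visual stylesheet) into the sandbox ...
p4c.sandbox_send_to(source_file='styles/lang_main.xml', dest_file='lang_main.xml',
                    overwrite=True, sandbox_name=SANDBOX)

# ... let Cytoscape write an export into the sandbox ...
p4c.export_image(filename='network', type='SVG', overwrite_file=True)

# ... and pull the result back to the local target folder
p4c.sandbox_get_from(source_file='network.svg', dest_file='results/network.svg',
                     overwrite=True, sandbox_name=SANDBOX)
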
def layout_network(
@@ -192,6 +222,7 @@ def apply_style_to_network(
    node_size_property: str = 'node_selection',
    min_node_size: int = 15,
    max_node_size: int = 40,
    sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
    """Cytoscape: apply a chosen Cytoscape style to the defined network

@@ -221,7 +252,16 @@
        raise FileNotFoundError(
            f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
        )
    p4c.import_visual_styles(str(pth_to_stylesheet))
    # send to sandbox
    sandbox_filename = pth_to_stylesheet.name
    p4c.sandbox_send_to(
        source_file=pth_to_stylesheet,
        dest_file=sandbox_filename,
        overwrite=True,
        sandbox_name=sandbox_name,
    )
    # load stylesheet
    p4c.import_visual_styles(sandbox_filename)

    p4c.set_visual_style(style_name, network=network_name)
    # node size mapping, only if needed property is available
@@ -242,6 +282,7 @@ def apply_style_to_network(
            default_number=min_node_size,
        )
        p4c.set_node_size_mapping(**node_size_map)
    # TODO removal
    # else:
    #     node_table = p4c.get_table_columns(table='node', network=network_name)
    #     nodes_SUID = node_table['SUID'].to_list()

@@ -1,5 +1,7 @@
import enum
from collections.abc import Hashable
from typing import (
    Any,
    Literal,
    Required,
    TypeAlias,
@@ -7,6 +9,7 @@ from typing import (
)

import numpy as np
from pandas import DataFrame
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor

@@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
    TIMELINE = 'TIMELINE'
    TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
    TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
    TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
    TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
    TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
DataFrameTLFiltered: TypeAlias = DataFrame

# ** graphs
NodeTitle: TypeAlias = str
@@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):

# ** timeline
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]


# ** Dash (Dashboard)
HTMLTable: TypeAlias = list[dict[Hashable, Any]]
HTMLColumns: TypeAlias = list[dict[str, str]]

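A hedged illustration (toy values) of the new Dash-related aliases next to the existing TimelineCandidates shape, matching how the dashboard callbacks above fill the DataTables.

from lang_main.types import HTMLColumns, HTMLTable, TimelineCandidates

cands: TimelineCandidates = {42: ((3, 7, 9), (12, 15))}
columns: HTMLColumns = [{'name': 'ErstellungsDatum', 'id': 'ErstellungsDatum'}]
rows: HTMLTable = [{'ErstellungsDatum': '2024-01-04'}]
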
File diff suppressed because one or more lines are too long