sandboxing

Florian Förster 2024-08-05 08:43:45 +02:00
parent 9197146d2c
commit 3f58a14852
10 changed files with 2362 additions and 283 deletions

View File

@@ -1,9 +1,11 @@
 import time
 import webbrowser
-from pathlib import Path
+from collections.abc import Collection, Iterable
 from threading import Thread
 from typing import Any, Final, cast
+import pandas as pd
 # import dash_cytoscape as cyto
 import plotly.express as px
 from dash import (
@@ -21,20 +23,37 @@ from plotly.graph_objects import Figure
 import lang_main.io
 from lang_main.analysis import graphs, tokens
-from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
+from lang_main.analysis.timeline import (
+    calc_delta_to_next_failure,
+    filter_timeline_cands,
+)
+from lang_main.constants import (
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    NAME_DELTA_FEAT_TO_REPAIR,
+    SAVE_PATH_FOLDER,
+    SPCY_MODEL,
+)
 from lang_main.errors import EmptyEdgesError, EmptyGraphError
 from lang_main.pipelines.predefined import (
     build_tk_graph_render_pipe,
     build_tk_graph_rescaling_pipe,
 )
-from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    EntryPoints,
+    HTMLColumns,
+    HTMLTable,
+    ObjectID,
+    TimelineCandidates,
+)
 # ** data
 # p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
-p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
+p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
 (data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
+# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
 # p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
-p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
+p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
 cands, texts = cast(
     tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
 )
@@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
     file_ext='.svg',
 )
+# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
-TABLE_FEATS: Final[list[str]] = [
+TABLE_FEATS_OVERVIEW: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
+    NAME_DELTA_FEAT_TO_REPAIR,
     'VorgangsTypName',
     'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
 ]
 TABLE_FEATS_DATES: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
 ]
+TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
+    'ErstellungsDatum',
+    'ErledigungsDatum',
+    'VorgangsTypName',
+    'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+]
 # ** figure config
 MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 HOVER_DATA_DELTA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsDatum': '|%d.%m.%Y',
-    'delta': True,
+    NAME_DELTA_FEAT_TO_REPAIR: True,
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 # ** graph
@@ -136,10 +167,10 @@ graph_layout = html.Div(
                 html.Img(
                     id='static-graph-img',
                     alt='static rendered graph',
-                    # style={
-                    #     'width': 'auto',
-                    #     'height': 'auto',
-                    # },
+                    style={
+                        'width': 'auto',
+                        'height': 'auto',
+                    },
                 ),
                 html.P(id='info-graph-errors', children=[]),
             ],
@@ -186,7 +217,27 @@ app.layout = html.Div(
             ]
         ),
         html.Div(
-            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
+            children=[
+                html.Div(
+                    [
+                        html.H5('Überblick ähnlicher Vorgänge'),
+                        dash_table.DataTable(id='table-candidates'),
+                    ],
+                    style={'paddingBottom': '1em'},
+                ),
+                html.Div(
+                    [
+                        html.H5(
+                            (
+                                'Maßnahmen sortiert nach längstem Zeitraum '
+                                'bis zum nächsten Ereignis'
+                            )
+                        ),
+                        dash_table.DataTable(id='table-best-actions'),
+                    ]
+                ),
+            ],
+            style={'marginBottom': '2em', 'padding': '2em'},
         ),
         graph_layout,
     ],
@@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):
 # ** helpers to filter DataFrame
-def pre_filter_data(
+def filter_candidates(
     data: DataFrame,
     idx: int,
     obj_id: ObjectID,
-) -> DataFrame:
+) -> DataFrameTLFiltered:
+    # assert correct data type because of Dash
     idx = int(idx)
     obj_id = int(obj_id)
-    # data = data.copy()
-    cands_for_obj_id = cands[obj_id]
-    cands_choice = cands_for_obj_id[int(idx) - 1]
-    # data
-    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
-    data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
-    data['delta'] = data['delta'].dt.days
+    data = filter_timeline_cands(
+        data=data,
+        cands=cands,
+        obj_id=obj_id,
+        entry_idx=(idx - 1),  # idx in Dashboard starts with 1
+    )
     return data
@@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
     obj_text = texts[obj_id]
     title_occurrences = f'HObjektText: {obj_text}'
     title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     # figure
     fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
-    fig_delta = fig_timeline_delta(df, title_delta)
+    fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)
     return fig_occurrences, fig_delta
@@ -293,11 +345,12 @@ def fig_timeline_occurrences(
 def fig_timeline_delta(
     df: DataFrame,
     title: str,
+    delta_feature: str,
 ) -> Figure:
     fig = px.scatter(
         data_frame=df,
         x='ErstellungsDatum',
-        y='delta',
+        y=delta_feature,
         title=title,
         hover_data=HOVER_DATA_DELTA,
     )
@@ -309,25 +362,77 @@ def fig_timeline_delta(
     return fig
+def transform_to_HTML_table(
+    data: DataFrame,
+    target_features: Collection[str],
+    date_cols: Iterable[str] | None = None,
+    sorting_feature: str | None = None,
+    sorting_ascending: bool = True,
+) -> tuple[HTMLColumns, HTMLTable]:
+    target_features = list(target_features)
+    data = data.copy()
+    data = data.filter(items=target_features, axis=1)
+    if sorting_feature is not None:
+        data = data.sort_values(by='ErstellungsDatum', ascending=sorting_ascending)
+    if date_cols is not None:
+        for col in date_cols:
+            data[col] = data[col].dt.strftime(r'%Y-%m-%d')
+    columns = [{'name': col, 'id': col} for col in data.columns]
+    table_data = data.to_dict('records')
+    return columns, table_data
+# 'table-best-actions'
 # ** HTML table
 @callback(
-    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
+    [
+        Output('table-candidates', 'columns'),
+        Output('table-candidates', 'data'),
+        Output('table-best-actions', 'columns'),
+        Output('table-best-actions', 'data'),
+    ],
     Input('selector-candidates', 'value'),
     State('selector-obj_id', 'value'),
     prevent_initial_call=True,
 )
-def update_table_candidates(index, obj_id):
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
-    df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
-        by='ErstellungsDatum', ascending=True
+def update_tables_candidates(
+    index,
+    obj_id,
+) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
+    cands = filter_candidates(data, idx=index, obj_id=obj_id)
+    overview_cols, overview_table = transform_to_HTML_table(
+        data=cands,
+        target_features=TABLE_FEATS_OVERVIEW,
+        date_cols=TABLE_FEATS_DATES,
+        sorting_feature='ErstellungsDatum',
+        sorting_ascending=True,
     )
-    cols = [{'name': i, 'id': i} for i in df.columns]
-    # convert dates to strings
-    for col in TABLE_FEATS_DATES:
-        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
-    table_data = df.to_dict('records')
-    return table_data, cols
+    # df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
+    #     by='ErstellungsDatum', ascending=True
+    # )
+    # cols = [{'name': i, 'id': i} for i in df.columns]
+    # # convert dates to strings
+    # for col in TABLE_FEATS_DATES:
+    #     df[col] = df[col].dt.strftime(r'%Y-%m-%d')
+    # table_data = df.to_dict('records')
+    cands_best_actions = calc_delta_to_next_failure(
+        data=cands,
+        date_feature='ErstellungsDatum',
+        name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    )
+    best_actions_cols, best_actions_table = transform_to_HTML_table(
+        data=cands_best_actions,
+        target_features=TABLE_FEATS_BEST_ACTIONS,
+        date_cols=TABLE_FEATS_DATES,
+    )
+    return overview_cols, overview_table, best_actions_cols, best_actions_table
@@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
 def display_candidates_as_graph(index, obj_id):
     error_msg = ''
     t1 = time.perf_counter()
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     t2 = time.perf_counter()
     print(f'Time for filtering: {t2 - t1} s')
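Note on the dashboard change above: the new transform_to_HTML_table helper turns a filtered DataFrame into the (columns, data) pair that dash_table.DataTable consumes. A minimal standalone sketch of that conversion follows; the frame and its column names are hypothetical placeholders, not the real IHM features.

import pandas as pd

# hypothetical stand-in for the filtered candidate frame
df = pd.DataFrame(
    {
        'created': pd.to_datetime(['2024-01-05', '2024-02-10']),
        'description': ['pump inspected', 'seal replaced'],
    }
)

# same steps as in transform_to_HTML_table: format dates as strings,
# then build the column spec and the record list for dash_table.DataTable
df['created'] = df['created'].dt.strftime(r'%Y-%m-%d')
columns = [{'name': col, 'id': col} for col in df.columns]
table_data = df.to_dict('records')

print(columns)     # [{'name': 'created', 'id': 'created'}, {'name': 'description', 'id': 'description'}]
print(table_data)  # [{'created': '2024-01-05', 'description': 'pump inspected'}, ...]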

View File

@ -1,9 +1,8 @@
import re from collections.abc import Collection
from collections.abc import Iterable
from itertools import combinations from itertools import combinations
from math import factorial from math import factorial
from pathlib import Path from pathlib import Path
from typing import Callable, cast from typing import cast
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex from lang_main.types import Embedding, PandasIndex
# ** RE patterns # TODO removal
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+') # pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}') # pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?') # pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}') # pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing # ** (1) dataset preparation: loading and simple preprocessing
@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
# duplicate cleansing based on all properties # duplicate cleansing based on all properties
def load_raw_data( def load_raw_data(
path: Path, path: Path,
date_cols: Iterable[str] = ( date_cols: Collection[str] = (
'VorgangsDatum', 'VorgangsDatum',
'ErledigungsDatum', 'ErledigungsDatum',
'Arbeitsbeginn', 'Arbeitsbeginn',
@ -50,7 +50,7 @@ def load_raw_data(
---------- ----------
path : str path : str
path to dataset file, usually CSV file path to dataset file, usually CSV file
date_cols : list[str], optional date_cols : Collection[str], optional
columns which contain dates and are parsed as such, columns which contain dates and are parsed as such,
by default ( by default (
'VorgangsDatum', 'VorgangsDatum',
@ -129,9 +129,7 @@ def remove_duplicates(
def remove_NA( def remove_NA(
data: DataFrame, data: DataFrame,
target_features: list[str] = [ target_features: Collection[str] = ('VorgangsBeschreibung',),
'VorgangsBeschreibung',
],
) -> tuple[DataFrame]: ) -> tuple[DataFrame]:
"""function to drop NA entries based on a subset of features to be analysed """function to drop NA entries based on a subset of features to be analysed
@ -139,14 +137,15 @@ def remove_NA(
---------- ----------
data : DataFrame data : DataFrame
standard IHM dataset, perhaps pre-cleaned standard IHM dataset, perhaps pre-cleaned
target_features : list[str], optional target_features : Collection[str], optional
subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ] subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)
Returns Returns
------- -------
DataFrame DataFrame
dataset with removed NA entries for given subset of features dataset with removed NA entries for given subset of features
""" """
target_features = list(target_features)
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info( logger.info(
f'Removed NA entries for features >>{target_features}<< from dataset successfully.' f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@ -156,46 +155,7 @@ def remove_NA(
# ** (2) entry-based cleansing # ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset # ** moved to module ``lang_main.analysis.shared``
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
def entry_wise_cleansing(
data: DataFrame,
target_feature: str,
cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
# ** in-depth analysis of one feature # ** in-depth analysis of one feature

View File

@ -1,4 +1,5 @@
from collections.abc import Iterable, Iterator import re
from collections.abc import Callable, Collection, Iterable, Iterator
from typing import cast from typing import cast
import networkx as nx import networkx as nx
@ -7,14 +8,70 @@ import numpy.typing as npt
import sentence_transformers import sentence_transformers
import sentence_transformers.util import sentence_transformers.util
from networkx import Graph from networkx import Graph
from pandas import Series from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
from torch import Tensor from torch import Tensor
from lang_main.analysis.graphs import get_graph_metadata, update_graph from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.loggers import logger_preprocess as logger
from lang_main.types import PandasIndex from lang_main.types import PandasIndex
# ** RE patterns
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** RE applications
# following functions clean and prepare specific entries, not whole datasets
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_escape_newline.sub('. ', string)
string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
# ** dataset cleansing
def entry_wise_cleansing(
data: DataFrame,
target_features: Collection[str],
cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
target_features = list(target_features)
data[target_features] = data[target_features].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
cleansing_func.__name__,
target_features,
)
return (data,)
# ** similarities
def candidates_by_index( def candidates_by_index(
data_model_input: Series, data_model_input: Series,
model: SentenceTransformer, model: SentenceTransformer,
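A condensed, self-contained sketch of what the relocated cleaning chain does to a single entry; the input string is invented and the expected output is derived from the patterns added in the hunk above (pattern_dates is omitted here, as it is also unused in clean_string_slim).

import re

# same patterns as in lang_main.analysis.shared (see hunk above)
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_whitespace = re.compile(r'[ ]{2,}')

def clean_string_slim(string: str) -> str:
    # newlines become sentence breaks, other escape sequences become spaces,
    # punctuation runs collapse to their last character, whitespace is squeezed
    string = pattern_escape_newline.sub('. ', string)
    string = pattern_escape_seq.sub(' ', string)
    string = pattern_repeated_chars.sub('', string)
    string = pattern_whitespace.sub(' ', string)
    return string.strip()

print(clean_string_slim('Pumpe defekt!!!\n\nMotor getauscht...  OK'))
# -> 'Pumpe defekt. Motor getauscht. OK'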

View File

@@ -1,4 +1,4 @@
-from collections.abc import Iterable, Iterator
+from collections.abc import Collection, Iterable, Iterator
 from typing import cast
 from pandas import DataFrame, Series
@@ -7,14 +7,21 @@ from tqdm.auto import tqdm  # TODO: check deletion
 from lang_main.analysis.shared import (
     candidates_by_index,
+    entry_wise_cleansing,
+    pattern_escape_seq_sentences,
     similar_index_connection_graph,
     similar_index_groups,
 )
 from lang_main.loggers import logger_timeline as logger
-from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    ObjectID,
+    PandasIndex,
+    TimelineCandidates,
+)
-def non_relevant_obj_ids(
+def _non_relevant_obj_ids(
     data: DataFrame,
     thresh_unique_feat_per_id: int,
     *,
@@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
     feature_uniqueness: str = 'HObjektText',
     feature_obj_id: str = 'ObjektID',
 ) -> tuple[DataFrame]:
-    logger.info('Removing non-relevant ObjectIDs from dataset')
+    logger.info('Removing non-relevant ObjectIDs from dataset...')
     data = data.copy()
-    ids_to_ignore = non_relevant_obj_ids(
+    ids_to_ignore = _non_relevant_obj_ids(
         data=data,
         thresh_unique_feat_per_id=thresh_unique_feat_per_id,
         feature_uniqueness=feature_uniqueness,
@@ -61,7 +68,43 @@
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
     logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
-    logger.info('Non-relevant ObjectIDs removed successfully')
+    logger.info('Non-relevant ObjectIDs removed successfully.')
+    return (data,)
+def cleanup_descriptions(
+    data: DataFrame,
+    properties: Collection[str] = (
+        'VorgangsBeschreibung',
+        'ErledigungsBeschreibung',
+    ),
+) -> tuple[DataFrame]:
+    logger.info('Cleaning necessary descriptions...')
+    data = data.copy()
+    features = list(properties)
+    data[features] = data[features].fillna('N.V.')
+    (data,) = entry_wise_cleansing(data, target_features=features)
+    logger.info('Cleansing successful.')
+    return (data.copy(),)
+def calc_delta_to_repair(
+    data: DataFrame,
+    date_feature_start: str = 'ErstellungsDatum',
+    date_feature_end: str = 'ErledigungsDatum',
+    name_delta_feature: str = 'delta_to_repair',
+    convert_to_days: bool = True,
+) -> tuple[DataFrame]:
+    logger.info('Calculating time differences between start and end of operations...')
+    data = data.copy()
+    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+    logger.info('Calculation successful.')
     return (data,)
@@ -75,7 +118,7 @@ def generate_model_input(
         'VorgangsBeschreibung',
     ),
 ) -> tuple[DataFrame]:
-    logger.info('Generating concatenation of model input features')
+    logger.info('Generating concatenation of model input features...')
     data = data.copy()
     model_input_features = list(model_input_features)
     input_features = data[model_input_features].fillna('').astype(str)
@@ -83,7 +126,7 @@
         lambda x: ' - '.join(x),
         axis=1,
     )
-    logger.info('Model input generated successfully')
+    logger.info('Model input generated successfully.')
     return (data,)
@@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
 ) -> tuple[DataFrame, Series]:
     data = data.copy()
     # filter only relevant activities count occurrences for each ObjectID
-    logger.info('Filtering activities per ObjectID')
+    logger.info('Filtering activities per ObjectID...')
     filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
     data_filter_activities = data.loc[filt_rel_activities].copy()
     num_activities_per_obj_id = cast(
@@ -113,7 +156,7 @@
     num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
     data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
-    logger.info('Activities per ObjectID filtered successfully')
+    logger.info('Activities per ObjectID filtered successfully.')
     return data_filter_activities, num_activities_per_obj_id
@@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
 ## use idx pairs to get idx values of series
-def get_timeline_candidates_index(
+def _get_timeline_candidates_index(
     data: DataFrame,
     num_activities_per_obj_id: Series,
     *,
@@ -161,7 +204,7 @@ def get_timeline_candidates_index(
 # TODO: check application for duplicate removal
-def transform_timeline_candidates(
+def _transform_timeline_candidates(
     candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
 ) -> TimelineCandidates:
     """function to build a mapping of ObjectIDs to their respective collection of
@@ -200,7 +243,7 @@ def transform_timeline_candidates(
     return candidates_by_obj_id
-def map_obj_id_to_texts(
+def _map_obj_id_to_texts(
     data: DataFrame,
     feature_obj_id: str = 'ObjektID',
 ) -> dict[ObjectID, str]:
@@ -229,7 +272,7 @@ def get_timeline_candidates(
     model_input_feature: str = 'nlp_model_input',
 ) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
     logger.info('Obtaining timeline candidates...')
-    candidates = get_timeline_candidates_index(
+    candidates = _get_timeline_candidates_index(
         data=data,
         num_activities_per_obj_id=num_activities_per_obj_id,
         model=model,
@@ -237,14 +280,52 @@
         feature_obj_id=feature_obj_id,
         model_input_feature=model_input_feature,
     )
-    tl_candidates = transform_timeline_candidates(candidates)
+    tl_candidates = _transform_timeline_candidates(candidates)
     logger.info('Timeline candidates obtained successfully.')
     # text mapping to obtain object descriptors
     logger.info('Mapping ObjectIDs to their respective text descriptor...')
-    map_obj_text = map_obj_id_to_texts(
+    map_obj_text = _map_obj_id_to_texts(
         data=data,
         feature_obj_id=feature_obj_id,
     )
     logger.info('ObjectIDs successfully mapped to text descriptors.')
     return tl_candidates, map_obj_text
+# ** Postprocessing
+# filter original dataset for a batch of timeline candidates
+def filter_timeline_cands(
+    data: DataFrame,
+    cands: TimelineCandidates,
+    obj_id: ObjectID,
+    entry_idx: int,
+    sort_feature: str = 'ErstellungsDatum',
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    cands_for_obj_id = cands[obj_id]
+    cands_choice = cands_for_obj_id[entry_idx]
+    data = data.loc[list(cands_choice)].sort_values(
+        by=sort_feature,
+        ascending=True,
+    )
+    return data
+def calc_delta_to_next_failure(
+    data: DataFrameTLFiltered,
+    date_feature: str = 'ErstellungsDatum',
+    name_delta_feature: str = 'delta_to_next_failure',
+    convert_to_days: bool = True,
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    last_val = data[date_feature].iat[-1]
+    shifted = data[date_feature].shift(-1, fill_value=last_val)
+    data[name_delta_feature] = shifted - data[date_feature]
+    data = data.sort_values(by=name_delta_feature, ascending=False)
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+    return data
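To illustrate the shift-based delta in calc_delta_to_next_failure above: each event's delta is the gap to the next event in the date-sorted candidate timeline, the last event is filled with its own date (delta 0), and the rows are then ordered by the longest gap first. A minimal sketch with made-up dates:

import pandas as pd

# hypothetical, already date-sorted candidate timeline
data = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-11', '2024-03-01'])}
)

# same mechanics as calc_delta_to_next_failure
last_val = data['ErstellungsDatum'].iat[-1]
shifted = data['ErstellungsDatum'].shift(-1, fill_value=last_val)
data['delta_to_next_failure'] = shifted - data['ErstellungsDatum']
data = data.sort_values(by='delta_to_next_failure', ascending=False)
data['delta_to_next_failure'] = data['delta_to_next_failure'].dt.days

print(data)
#   ErstellungsDatum  delta_to_next_failure
# 1       2024-01-11                     50
# 0       2024-01-01                     10
# 2       2024-03-01                      0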

View File

@@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
     'isDeterministic': True,
     'singlePartition': False,
 }
+CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
 CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
 # name for property, on which selection is done
 CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
 CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
 CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
-# ** time analysis.uniqueness
+# ** time_analysis.uniqueness
 THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
     'threshold_unique_texts'
 ]
@@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
     'criterion_feature'
 ]
 FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
+# ** time_analysis.preparation
+NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
+# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
+NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
 # ** time_analysis.model_input
 MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
     CONFIG['time_analysis']['model_input']['input_features']

View File

@@ -3,15 +3,19 @@ from pathlib import Path
 from lang_main.analysis import graphs
 from lang_main.analysis.preprocessing import (
     analyse_feature,
-    clean_string_slim,
-    entry_wise_cleansing,
     load_raw_data,
     merge_similarity_dupl,
     numeric_pre_filter_feature,
     remove_duplicates,
     remove_NA,
 )
+from lang_main.analysis.shared import (
+    clean_string_slim,
+    entry_wise_cleansing,
+)
 from lang_main.analysis.timeline import (
+    calc_delta_to_repair,
+    cleanup_descriptions,
     filter_activities_per_obj_id,
     generate_model_input,
     get_timeline_candidates,
@@ -25,6 +29,7 @@ from lang_main.constants import (
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
+    NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
     SPCY_MODEL,
     STFR_MODEL,
@@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
     pipe_target_feat.add(
         entry_wise_cleansing,
         {
-            'target_feature': 'VorgangsBeschreibung',
+            'target_feature': ('VorgangsBeschreibung',),
             'cleansing_func': clean_string_slim,
         },
         save_result=True,
@@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
         graphs.pipe_add_graph_metrics,
         save_result=save_result,
         filename=exit_point,
-        # filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
     )
     return pipe_graph_rescaling
@@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
 # ** timeline analysis
 def build_timeline_pipe() -> Pipeline:
     pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        cleanup_descriptions,
+        {
+            'properties': ['ErledigungsBeschreibung'],
+        },
+    )
+    pipe_timeline.add(
+        calc_delta_to_repair,
+        {
+            'date_feature_start': 'ErstellungsDatum',
+            'date_feature_end': 'ErledigungsDatum',
+            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
+            'convert_to_days': True,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
     pipe_timeline.add(
         remove_non_relevant_obj_ids,
         {
@@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
             'model_input_feature': 'nlp_model_input',
         },
         save_result=True,
-        filename=EntryPoints.TIMELINE_POST,
+        filename=EntryPoints.TIMELINE_CANDS,
     )
     return pipe_timeline

View File

@@ -16,6 +16,7 @@ from lang_main.constants import (
     CYTO_LAYOUT_PROPERTIES,
     CYTO_NUMBER_SUBGRAPHS,
     CYTO_PATH_STYLESHEET,
+    CYTO_SANDBOX_NAME,
     CYTO_SELECTION_PROPERTY,
     CYTO_STYLESHEET_NAME,
     PROPERTY_NAME_DEGREE_WEIGHTED,
@@ -56,6 +57,8 @@ def verify_connection():
 def import_to_cytoscape(
     graph: DiGraph | Graph,
     network_name: str = CYTO_BASE_NETWORK_NAME,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
+    reinitialise_sandbox: bool = True,
 ) -> None:
     """Cytoscape: import NetworkX graph as new network collection
@@ -66,6 +69,12 @@ def import_to_cytoscape(
     """
     logger.debug('Checking Cytoscape connection...')
     verify_connection()
+    logger.debug('Setting Cytoscape sandbox...')
+    p4c.sandbox_set(
+        sandbox_name=sandbox_name,
+        reinitialize=reinitialise_sandbox,
+        copy_samples=False,
+    )
     logger.debug('Importing to and analysing network in Cytoscape...')
     p4c.delete_all_networks()
     p4c.create_network_from_networkx(
@@ -122,6 +131,7 @@ def export_network_to_image(
     filetype: CytoExportFileTypes = 'SVG',
     network_name: str = CYTO_BASE_NETWORK_NAME,
     pdf_export_page_size: CytoExportPageSizes = 'A4',
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: export current selected view as image
@@ -140,14 +150,17 @@
     logger.debug('Exporting image to file...')
     if not target_folder.exists():
         target_folder.mkdir(parents=True)
-    file_pth = target_folder / filename
+    dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')
     text_as_font = True
     if filetype == 'SVG':
         text_as_font = False
+    # image is generated in sandbox directory and transferred to target destination
+    # (preparation for remote instances of Cytoscape)
+    # TODO close non-necessary windows before image display
     p4c.export_image(
-        filename=str(file_pth),
+        filename=filename,
         type=filetype,
         network=network_name,
         overwrite_file=True,
@@ -155,7 +168,24 @@
         export_text_as_font=text_as_font,
         page_size=pdf_export_page_size,
     )
-    logger.debug('Exporting image to file successful.')
+    # TODO change back to Cytoscape 3.10 and above
+    # TODO remove if Cytoscape >= 3.10.* is running in container
+    # p4c.export_image(
+    #     filename=filename,
+    #     type=filetype,
+    #     network=network_name,
+    #     overwrite_file=True,
+    # )
+    logger.debug('Exported image to sandbox.')
+    logger.debug('Transferring image from sandbox to target destination...')
+    sandbox_filename = f'{filename}.{filetype.lower()}'
+    p4c.sandbox_get_from(
+        source_file=sandbox_filename,
+        dest_file=str(dst_file_pth),
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    logger.debug('Transfer of image from sandbox to target destination successful.')
 def layout_network(
@@ -192,6 +222,7 @@ def apply_style_to_network(
     node_size_property: str = 'node_selection',
     min_node_size: int = 15,
     max_node_size: int = 40,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: apply a chosen Cytoscape style to the defined network
@@ -221,7 +252,16 @@
         raise FileNotFoundError(
             f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
         )
-    p4c.import_visual_styles(str(pth_to_stylesheet))
+    # send to sandbox
+    sandbox_filename = pth_to_stylesheet.name
+    p4c.sandbox_send_to(
+        source_file=pth_to_stylesheet,
+        dest_file=sandbox_filename,
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    # load stylesheet
+    p4c.import_visual_styles(sandbox_filename)
     p4c.set_visual_style(style_name, network=network_name)
     # node size mapping, only if needed property is available
@@ -242,6 +282,7 @@ def apply_style_to_network(
             default_number=min_node_size,
         )
         p4c.set_node_size_mapping(**node_size_map)
+    # TODO removal
     # else:
     #     node_table = p4c.get_table_columns(table='node', network=network_name)
     #     nodes_SUID = node_table['SUID'].to_list()
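The export change above routes all file traffic through a named py4cytoscape sandbox instead of writing directly to a local path, which also works when Cytoscape runs remotely or in a container. A condensed sketch of that round-trip, assuming a reachable Cytoscape instance and using only the calls that appear in the diff; the sandbox name, stylesheet path, and output names are placeholders.

from pathlib import Path

import py4cytoscape as p4c

SANDBOX = 'lang_main'  # placeholder, mirrors CYTO_SANDBOX_NAME

# pin all file transfers to a named sandbox on the Cytoscape side
p4c.sandbox_set(sandbox_name=SANDBOX, reinitialize=True, copy_samples=False)

# push a local stylesheet into the sandbox, then load it from there
p4c.sandbox_send_to(
    source_file='styles/lang_main.xml',  # hypothetical local stylesheet path
    dest_file='lang_main.xml',
    overwrite=True,
    sandbox_name=SANDBOX,
)
p4c.import_visual_styles('lang_main.xml')

# the export renders into the sandbox; pull the result back to a local destination
p4c.export_image(filename='graph', type='SVG', overwrite_file=True)
p4c.sandbox_get_from(
    source_file='graph.svg',
    dest_file=str(Path('results/graph.svg')),
    overwrite=True,
    sandbox_name=SANDBOX,
)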

View File

@@ -1,5 +1,7 @@
 import enum
+from collections.abc import Hashable
 from typing import (
+    Any,
     Literal,
     Required,
     TypeAlias,
@@ -7,6 +9,7 @@ from typing import (
 )
 import numpy as np
+from pandas import DataFrame
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
@@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
 class EntryPoints(enum.StrEnum):
     TIMELINE = 'TIMELINE'
     TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
+    TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
     TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
     TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
     TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
 Embedding: TypeAlias = SpacyDoc | Tensor
+DataFrameTLFiltered: TypeAlias = DataFrame
 # ** graphs
 NodeTitle: TypeAlias = str
@@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):
 # ** timeline
 TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
+# ** Dash (Dashboard)
+HTMLTable: TypeAlias = list[dict[Hashable, Any]]
+HTMLColumns: TypeAlias = list[dict[str, str]]

File diff suppressed because one or more lines are too long