diff --git a/pyproject.toml b/pyproject.toml index 7218e7c..fd2ab1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,3 +34,15 @@ trials = [ "plotly>=5.22.0", "dash>=2.17.0", ] + +[tool.ruff] +line-length = 94 +indent-width = 4 +target-version = "py311" + +[tool.ruff.format] +quote-style = "single" +skip-magic-trailing-comma = false + +[tool.ruff.lint] +select = ["E", "F", "I"] \ No newline at end of file diff --git a/scripts/analyse_dataset.py b/scripts/analyse_dataset.py index bd382a5..2316ede 100644 --- a/scripts/analyse_dataset.py +++ b/scripts/analyse_dataset.py @@ -1,33 +1,43 @@ import typing +import warnings +from pathlib import Path +from typing import cast -from pandas import DataFrame, Series - -from ihm_analyse import ( - SAVE_PATH_FOLDER, - PATH_TO_DATASET, - THRESHOLD_AMOUNT_CHARACTERS, - THRESHOLD_EDGE_WEIGHT, - DO_PREPROCESSING, - DO_TOKEN_ANALYSIS, - DO_GRAPH_POSTPROCESSING, +from lang_main import ( + TokenGraph, create_saving_folder, load_pickle, - Embedding, - Index, - TokenGraph, ) -from ihm_analyse.predefined_pipes import ( - pipe_target_feat, - pipe_embds, +from lang_main.constants import ( + DO_GRAPH_POSTPROCESSING, + DO_PREPROCESSING, + DO_TIME_ANALYSIS, + DO_TOKEN_ANALYSIS, + INPUT_PATH_FOLDER, + PATH_TO_DATASET, + SAVE_PATH_FOLDER, + SKIP_GRAPH_POSTPROCESSING, + SKIP_PREPROCESSING, + SKIP_TIME_ANALYSIS, + SKIP_TOKEN_ANALYSIS, + THRESHOLD_AMOUNT_CHARACTERS, + THRESHOLD_EDGE_WEIGHT, +) + +# Embedding, +# PandasIndex, +from lang_main.pipelines.predefined import ( pipe_merge, + pipe_target_feat, + pipe_timeline, pipe_token_analysis, ) -""" -# ** config parameters -SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) -PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) -THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters'] -""" +from lang_main.types import ( + ObjectID, + TimelineCandidates, +) +from pandas import DataFrame, Series + # ** processing pipeline def run_preprocessing() -> DataFrame: @@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame: overwrite_existing=True, ) # run pipelines - ret = typing.cast(tuple[DataFrame], - pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))) + ret = typing.cast( + tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)) + ) target_feat_data = ret[0] # only entries with more than threshold amount of characters - data_filter = typing.cast(Series, - (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)) - subset_data = target_feat_data.loc[data_filter, 'entry'].copy() - dupl_idx_pairs, embds = typing.cast( - tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]], - pipe_embds.run(starting_values=(subset_data,)) - ) + data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)) + # subset_data = target_feat_data.loc[data_filter, 'entry'].copy() + # dupl_idx_pairs, embds = typing.cast( + # tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]], + # pipe_embds.run(starting_values=(subset_data,)), + # ) # merge duplicates, results saved separately - ret = typing.cast(tuple[DataFrame], - pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs))) + subset_data = target_feat_data.loc[data_filter].copy() + ret = typing.cast( + tuple[DataFrame], + # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)), + pipe_merge.run(starting_values=(subset_data,)), + ) preprocessed_data = ret[0] - + return preprocessed_data + def run_token_analysis( preprocessed_data: 
DataFrame, ) -> TokenGraph: # build token graph - (tk_graph,) = typing.cast(tuple[TokenGraph], - pipe_token_analysis.run(starting_values=(preprocessed_data,))) + (tk_graph,) = typing.cast( + tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,)) + ) tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False) - tk_graph.to_pickle(SAVE_PATH_FOLDER, - filename=f'{pipe_token_analysis.name}-TokenGraph') - + tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph') + return tk_graph + def run_graph_postprocessing( tk_graph: TokenGraph, ) -> TokenGraph: # filter graph by edge weight and remove single nodes (no connection) tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT) tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1) - tk_graph_filtered.save_graph(SAVE_PATH_FOLDER, - filename='TokenGraph-filtered', - directed=False) - tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER, - filename=f'{pipe_token_analysis.name}-TokenGraph-filtered') - + tk_graph_filtered.save_graph( + SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False + ) + tk_graph_filtered.to_pickle( + SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered' + ) + return tk_graph_filtered -if __name__ == '__main__': + +def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]: + filename = 'without_nan' + loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + ret = load_pickle(loading_path) + preprocessed_data = ret[0] + + ret = cast( + tuple[TimelineCandidates, dict[ObjectID, str]], + pipe_timeline.run(starting_values=(preprocessed_data,)), + ) + return ret + + +def verify_path( + loading_path: Path, +) -> None: + if not loading_path.exists(): + raise FileNotFoundError(f'Could not load results. File not found: {loading_path}') + + +def main() -> None: + pre_step_skipped: bool = False # ** preprocess - if DO_PREPROCESSING: + if DO_PREPROCESSING and not SKIP_PREPROCESSING: preprocessed_data = run_preprocessing() - else: + elif not SKIP_PREPROCESSING: # !! hardcoded result filenames target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*' - target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0] - ret = typing.cast(tuple[DataFrame], - load_pickle(target_filepath)) + loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0] + verify_path(loading_path) + ret = typing.cast(tuple[DataFrame], load_pickle(loading_path)) preprocessed_data = ret[0] - # ** token analysis - if DO_TOKEN_ANALYSIS: - preprocessed_data_trunc = typing.cast(DataFrame, - preprocessed_data[['entry', 'num_occur']].copy()) # type: ignore - tk_graph = run_token_analysis(preprocessed_data_trunc) else: + pre_step_skipped = True + warnings.warn('No preprocessing action selected. Skipped.') + # sys.exit(0) + # ** token analysis + if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS: + if pre_step_skipped: + raise RuntimeError( + 'Preprocessing step skipped. Token analysis cannot be performed.' + ) + preprocessed_data_trunc = typing.cast( + DataFrame, preprocessed_data[['entry', 'num_occur']].copy() + ) # type: ignore + tk_graph = run_token_analysis(preprocessed_data_trunc) + elif not SKIP_TOKEN_ANALYSIS: # !! 
hardcoded result filenames # whole graph filename: str = f'{pipe_token_analysis.name}-TokenGraph' - loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle') - #tk_graph = typing.cast(TokenGraph, load_pickle(loading_path)) + loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path)) tk_graph = TokenGraph.from_pickle(loading_path) - # ** graph postprocessing - if DO_GRAPH_POSTPROCESSING: - tk_graph_filtered = run_graph_postprocessing(tk_graph) + pre_step_skipped = False else: + pre_step_skipped = True + warnings.warn('No token analysis action selected. Skipped.') + # ** graph postprocessing + if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING: + if pre_step_skipped: + raise RuntimeError( + ( + 'Preprocessing or token analysis step skipped. ' + 'Graph postprocessing cannot be performed.' + ) + ) + tk_graph_filtered = run_graph_postprocessing(tk_graph) + elif not SKIP_GRAPH_POSTPROCESSING: # !! hardcoded result filenames # filtered graph filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered' - loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle') - #tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path)) - tk_graph_filtered = TokenGraph.from_pickle(loading_path) \ No newline at end of file + loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path)) + tk_graph_filtered = TokenGraph.from_pickle(loading_path) + pre_step_skipped = False + else: + warnings.warn('No graph postprocessing action selected. Skipped.') + # ** time analysis + if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS: + # no check for fails, runs separately + ret = run_time_analysis() + elif not SKIP_TIME_ANALYSIS: + ... + else: + warnings.warn('No time analysis action selected. 
Skipped.') + + +if __name__ == '__main__': + main() diff --git a/scripts/inputs/without_nan.pkl b/scripts/inputs/without_nan.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/scripts/inputs/without_nan.pkl differ diff --git a/src/lang_main/config.toml b/scripts/lang_main_config copy.toml similarity index 65% rename from src/lang_main/config.toml rename to scripts/lang_main_config copy.toml index e5f978f..8cf2829 100644 --- a/src/lang_main/config.toml +++ b/scripts/lang_main_config copy.toml @@ -1,17 +1,21 @@ # lang_main: Config file [paths] -results = './results/test_new2/' -dataset = './01_2_Rohdaten_neu/Export4.csv' +inputs = 'A:/Arbeitsaufgaben/lang-main/scripts' +results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/' +dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv' #results = './results/Export7/' #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' #results = './results/Export7_trunc/' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' [control] -preprocessing = false -token_analysis = true +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = true graph_postprocessing = false +graph_postprocessing_skip = true #[export_filenames] #filename_cossim_filter_candidates = 'CosSim-FilterCandidates' diff --git a/scripts/lang_main_config.toml b/scripts/lang_main_config.toml new file mode 100644 index 0000000..3d0fdd7 --- /dev/null +++ b/scripts/lang_main_config.toml @@ -0,0 +1,59 @@ +# lang_main: Config file + +[paths] +inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/' +results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/' +dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = true +token_analysis = false +token_analysis_skip = true +graph_postprocessing = false +graph_postprocessing_skip = true +time_analysis = true +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + +[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +# input_features = [ +# 'VorgangsTypName', +# 'VorgangsArtText', +# 'VorgangsBeschreibung', +# ] +input_features = [ + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py new file mode 100644 index 0000000..8076042 --- /dev/null +++ b/scripts/test.py @@ -0,0 +1,12 @@ +from lang_main.analysis.preprocessing import clean_string_slim +from lang_main.constants import SAVE_PATH_FOLDER + +print(SAVE_PATH_FOLDER) +txt = """ +Wir feiern den Jahrestag, olé! +tel:::: !!!!???? +++49 123 456 789 + +Doch leben wir länger. 
+""" +print(txt) +print(clean_string_slim(txt)) diff --git a/src/lang_main/__init__.py b/src/lang_main/__init__.py index c6ae768..85a218e 100644 --- a/src/lang_main/__init__.py +++ b/src/lang_main/__init__.py @@ -1,18 +1,19 @@ -from typing import Final, Any import inspect -import sys import logging -from time import gmtime +import shutil +import sys from pathlib import Path +from time import gmtime +from typing import Any, Final -from lang_main.shared import ( - save_pickle, - load_pickle, - create_saving_folder, - load_toml_config, -) -from lang_main.analysis.preprocessing import Embedding, PandasIndex from lang_main.analysis.graphs import TokenGraph +from lang_main.analysis.preprocessing import Embedding, PandasIndex +from lang_main.shared import ( + create_saving_folder, + load_pickle, + load_toml_config, + save_pickle, +) __all__ = [ 'save_pickle', @@ -32,37 +33,30 @@ logging.basicConfig( datefmt=LOG_DATE_FMT, ) -USE_INTERNAL_CONFIG: Final[bool] = True +CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' +USE_INTERNAL_CONFIG: Final[bool] = False + +pkg_dir = Path(__file__).parent +cfg_path_internal = pkg_dir / CONFIG_FILENAME # load config data: internal/external if USE_INTERNAL_CONFIG: - curr_file_dir = Path(inspect.getfile(inspect.currentframe())) # type: ignore - pkg_dir = curr_file_dir.parent - config_path = Path(pkg_dir, 'config.toml') - loaded_config = load_toml_config(path_to_toml=config_path) - CONFIG: Final[dict[str, Any]] = loaded_config.copy() + loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal) else: - raise NotImplementedError("External config data not implemented yet.") + caller_file = Path(inspect.stack()[-1].filename) + if not caller_file.exists(): + raise FileNotFoundError('Caller file could not be correctly retrieved.') + cfg_path_external = caller_file.parent / CONFIG_FILENAME + if not cfg_path_external.exists(): + shutil.copy(cfg_path_internal, cfg_path_external) + sys.exit( + ( + 'No config file was found. A new one with default values was created ' + 'in the execution path. Please fill in the necessary values and ' + 'restart the programm.' 
+ ) + ) + # raise NotImplementedError("External config data not implemented yet.") + loaded_cfg = load_toml_config(path_to_toml=cfg_path_external) -# ** paths -SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) -PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) -# ** control -DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] -DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] -DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] -# ** export - -# ** preprocessing -FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\ - CONFIG['preprocess']['filename_cossim_filter_candidates'] -DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols'] -THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\ - CONFIG['preprocess']['threshold_amount_characters'] -THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'] -# ** token analysis - -# ** graph postprocessing -THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight'] -# ** time analysis -THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts'] +CONFIG: Final[dict[str, Any]] = loaded_cfg.copy() diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py index 0c524a2..dd74ebc 100644 --- a/src/lang_main/analysis/graphs.py +++ b/src/lang_main/analysis/graphs.py @@ -1,18 +1,18 @@ -import typing -from typing import Any, Self, Literal, overload, Final -import sys -from collections.abc import Hashable -from pathlib import Path import copy +import sys +import typing +from collections.abc import Hashable, Iterable +from pathlib import Path +from typing import Any, Final, Literal, Self, overload +import networkx as nx import numpy as np import numpy.typing as npt -from networkx import Graph, DiGraph -import networkx as nx +from networkx import DiGraph, Graph from pandas import DataFrame from lang_main.loggers import logger_graphs as logger -from lang_main.shared import save_pickle, load_pickle +from lang_main.shared import load_pickle, save_pickle # TODO change logging behaviour, add logging to file LOGGING_DEFAULT: Final[bool] = False @@ -31,18 +31,17 @@ def get_graph_metadata( min_edge_weight: int = 1_000_000 max_edge_weight: int = 0 for edge in graph.edges: - weight = typing.cast(int, - graph[edge[0]][edge[1]]['weight']) + weight = typing.cast(int, graph[edge[0]][edge[1]]['weight']) if weight < min_edge_weight: min_edge_weight = weight if weight > max_edge_weight: max_edge_weight = weight - + # memory edge_mem = sum([sys.getsizeof(e) for e in graph.edges]) node_mem = sum([sys.getsizeof(n) for n in graph.nodes]) total_mem = edge_mem + node_mem - + graph_info.update( num_nodes=num_nodes, num_edges=num_edges, @@ -52,20 +51,22 @@ def get_graph_metadata( edge_memory=edge_mem, total_memory=total_mem, ) - + if logging: - logger.info((f"Graph properties: {num_nodes} Nodes, " - f"{num_edges} Edges")) - logger.info(f"Node memory: {node_mem / 1024:.2f} KB") - logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB") - logger.info(f"Total memory: {total_mem / 1024:.2f} KB") - + logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges')) + logger.info(f'Node memory: {node_mem / 1024:.2f} KB') + logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB') + logger.info(f'Total memory: {total_mem / 1024:.2f} KB') + return graph_info + def update_graph( graph: Graph | DiGraph, - parent: Hashable, - child: Hashable, + *, + batch: Iterable[tuple[Hashable, Hashable]] | None = None, 
+ parent: Hashable | None = None, + child: Hashable | None = None, weight_connection: int = 1, ) -> None: # !! not necessary to check for existence of nodes @@ -78,7 +79,9 @@ def update_graph( graph.add_node(child) """ # check if edge not in Graph - if not graph.has_edge(parent, child): + if batch is not None: + graph.add_edges_from(batch, weight=weight_connection) + elif not graph.has_edge(parent, child): # create new edge, nodes will be created if not already present graph.add_edge(parent, child, weight=weight_connection) else: @@ -87,40 +90,38 @@ def update_graph( weight += weight_connection graph[parent][child]['weight'] = weight + # build undirected adjacency matrix def convert_graph_to_undirected( graph: DiGraph, logging: bool = LOGGING_DEFAULT, ) -> Graph: # get adjacency matrix - adj_mat = typing.cast(DataFrame, - nx.to_pandas_adjacency(G=graph, dtype=np.uint32)) - arr = typing.cast(npt.NDArray[np.uint32], - adj_mat.to_numpy()) + adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32)) + arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy()) # build undirected array: adding edges of lower triangular matrix to upper one arr_upper = np.triu(arr) arr_lower = np.tril(arr) arr_lower = np.rot90(np.fliplr(arr_lower)) arr_new = arr_upper + arr_lower # assign new data and create graph - adj_mat.loc[:] = arr_new # type: ignore - graph_undir = typing.cast(Graph, - nx.from_pandas_adjacency(df=adj_mat)) - + adj_mat.loc[:] = arr_new # type: ignore + graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat)) + # info about graph if logging: - logger.info("Successfully converted graph to one with undirected edges.") + logger.info('Successfully converted graph to one with undirected edges.') _ = get_graph_metadata(graph=graph_undir, logging=logging) - + return graph_undir + class TokenGraph(DiGraph): - def __init__( self, name: str = 'TokenGraph', enable_logging: bool = True, - incoming_graph_data: Any| None = None, + incoming_graph_data: Any | None = None, **attr, ) -> None: super().__init__(incoming_graph_data, **attr) @@ -133,15 +134,17 @@ class TokenGraph(DiGraph): self._metadata_directed: dict[str, int] = {} self._undirected: Graph | None = None self._metadata_undirected: dict[str, int] = {} - + def __repr__(self) -> str: return self.__str__() - + def __str__(self) -> str: - return (f"TokenGraph(name: {self.name}, number of nodes: " - f"{len(self.nodes)}, number of edges: " - f"{len(self.edges)})") - + return ( + f'TokenGraph(name: {self.name}, number of nodes: ' + f'{len(self.nodes)}, number of edges: ' + f'{len(self.edges)})' + ) + # !! only used to verify that saving was done correctly """ def __key(self) -> tuple[Hashable, ...]: @@ -150,7 +153,7 @@ class TokenGraph(DiGraph): def __hash__(self) -> int: return hash(self.__key()) """ - + def copy(self) -> Self: """returns a (deep) copy of the graph @@ -160,51 +163,46 @@ class TokenGraph(DiGraph): deep copy of the graph """ return copy.deepcopy(self) - + @property def name(self) -> str: return self._name - + @property def directed(self) -> Self: return self._directed - + @property def undirected(self) -> Graph | None: return self._undirected - + @property def metadata_directed(self) -> dict[str, int]: return self._metadata_directed - + @property def metadata_undirected(self) -> dict[str, int]: return self._metadata_undirected - + @overload def to_undirected( - self, + self, inplace: Literal[True] = ..., logging: bool | None = ..., - ) -> None: - ... - + ) -> None: ... 
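A note on the new batch path added to update_graph above: Graph.add_edges_from assigns weight=weight_connection to every edge in the iterable and, for edges that already exist, overwrites their stored weight rather than adding to it. So the batch branch does not accumulate edge weights the way the single parent/child branch does. That is fine for the similar-index graphs built in analysis/shared.py further down in this diff, where only connectivity matters, but it is not a drop-in replacement for token co-occurrence counting. A minimal, standalone sketch of the difference (illustrative only, not part of the codebase):

import networkx as nx

g = nx.Graph()
g.add_edge('a', 'b', weight=1)
g.add_edges_from([('a', 'b')], weight=1)
print(g['a']['b']['weight'])  # 1 -> existing weight is overwritten, not incremented

g['a']['b']['weight'] += 1    # the per-edge branch accumulates instead
print(g['a']['b']['weight'])  # 2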
+ @overload def to_undirected( - self, + self, inplace: Literal[False], logging: bool | None = ..., - ) -> Graph: - ... - + ) -> Graph: ... + @overload def to_undirected( - self, - inplace: bool = ..., - logging: bool | None = ... - ) -> Graph | None: - ... - + self, inplace: bool = ..., logging: bool | None = ... + ) -> Graph | None: ... + def to_undirected( self, inplace=True, @@ -212,27 +210,27 @@ class TokenGraph(DiGraph): ) -> Graph | None: if logging is None: logging = self.logging - - self._undirected = convert_graph_to_undirected(graph=self, - logging=logging) - self._metadata_undirected = get_graph_metadata(graph=self._undirected, - logging=logging) + + self._undirected = convert_graph_to_undirected(graph=self, logging=logging) + self._metadata_undirected = get_graph_metadata( + graph=self._undirected, logging=logging + ) if not inplace: return self._undirected - + def update_metadata( self, logging: bool | None = None, ) -> None: if logging is None: logging = self.logging - - self._metadata_directed = get_graph_metadata(graph=self, - logging=logging) + + self._metadata_directed = get_graph_metadata(graph=self, logging=logging) if self._undirected is not None: - self._metadata_undirected = get_graph_metadata(graph=self._undirected, - logging=logging) - + self._metadata_undirected = get_graph_metadata( + graph=self._undirected, logging=logging + ) + def filter_by_edge_weight( self, threshold: int, @@ -252,20 +250,19 @@ class TokenGraph(DiGraph): # filter edges by weight original_graph_edges = copy.deepcopy(self.edges) filtered_graph = self.copy() - + for edge in original_graph_edges: - weight = typing.cast(int, - filtered_graph[edge[0]][edge[1]]['weight']) + weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight']) if weight < threshold: filtered_graph.remove_edge(edge[0], edge[1]) - + if filtered_graph._undirected is not None: filtered_graph.to_undirected(inplace=True, logging=False) - + filtered_graph.update_metadata(logging=False) - + return filtered_graph - + def filter_by_node_degree( self, threshold: int, @@ -285,31 +282,31 @@ class TokenGraph(DiGraph): # filter nodes by degree original_graph_nodes = copy.deepcopy(self.nodes) filtered_graph = self.copy() - + for node in original_graph_nodes: - degree = filtered_graph.degree[node] # type: ignore + degree = filtered_graph.degree[node] # type: ignore if degree < threshold: filtered_graph.remove_node(node) - + if filtered_graph._undirected is not None: filtered_graph.to_undirected(inplace=True, logging=False) - + filtered_graph.update_metadata(logging=False) - + return filtered_graph - + def _save_prepare( self, path: Path, filename: str | None = None, ) -> Path: if filename is not None: - saving_path = path.joinpath(f"{filename}") + saving_path = path.joinpath(f'{filename}') else: - saving_path = path.joinpath(f"{self.name}") - + saving_path = path.joinpath(f'{self.name}') + return saving_path - + def save_graph( self, path: Path, @@ -335,19 +332,18 @@ class TokenGraph(DiGraph): undirected graph should be exported but is not available """ saving_path = self._save_prepare(path=path, filename=filename) - + if directed: target_graph = self._directed elif not directed and self._undirected is not None: target_graph = self._undirected else: - raise ValueError("No undirected graph available.") - + raise ValueError('No undirected graph available.') + saving_path = saving_path.with_suffix('.graphml') nx.write_graphml(G=target_graph, path=saving_path) - logger.info(("Successfully saved graph as GraphML file " - f"under 
{saving_path}.")) - + logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.')) + def to_pickle( self, path: Path, @@ -365,7 +361,7 @@ class TokenGraph(DiGraph): saving_path = self._save_prepare(path=path, filename=filename) saving_path = saving_path.with_suffix('.pickle') save_pickle(obj=self, path=saving_path) - + @classmethod def from_file( cls, @@ -378,15 +374,15 @@ class TokenGraph(DiGraph): match path.suffix: case '.graphml': graph = typing.cast(Self, nx.read_graphml(path, node_type=int)) - logger.info(f"Successfully loaded graph from GraphML file {path}.") + logger.info(f'Successfully loaded graph from GraphML file {path}.') case '.pkl' | '.pickle': graph = typing.cast(Self, load_pickle(path)) - logger.info(f"Successfully loaded graph from pickle file {path}.") + logger.info(f'Successfully loaded graph from pickle file {path}.') case _: - raise ValueError("File format not supported.") - + raise ValueError('File format not supported.') + return graph - + @classmethod def from_pickle( cls, @@ -394,10 +390,10 @@ class TokenGraph(DiGraph): ) -> Self: if isinstance(path, str): path = Path(path) - + if path.suffix not in ('.pkl', '.pickle'): - raise ValueError("File format not supported.") - + raise ValueError('File format not supported.') + graph = typing.cast(Self, load_pickle(path)) - - return graph \ No newline at end of file + + return graph diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py index feecfb6..059f6b9 100644 --- a/src/lang_main/analysis/preprocessing.py +++ b/src/lang_main/analysis/preprocessing.py @@ -1,29 +1,29 @@ -from typing import cast, Callable +import re from collections.abc import Iterable from itertools import combinations -import re from math import factorial from pathlib import Path +from typing import Callable, cast import numpy as np -from torch import Tensor -from pandas import DataFrame, Series import pandas as pd -from spacy.lang.de import German as GermanSpacyModel -from spacy.tokens.doc import Doc as SpacyDoc -from sentence_transformers import SentenceTransformer import sentence_transformers import sentence_transformers.util +from pandas import DataFrame, Series +from sentence_transformers import SentenceTransformer +from spacy.lang.de import German as GermanSpacyModel +from spacy.tokens.doc import Doc as SpacyDoc +from torch import Tensor from tqdm import tqdm -from lang_main.types import Embedding, PandasIndex -from lang_main.loggers import logger_preprocess as logger -from lang_main.pipelines.base import BasePipeline from lang_main.analysis.shared import ( + candidates_by_index, similar_index_connection_graph, similar_index_groups, ) -#from lang_main.analysis.graphs import update_graph, get_graph_metadata +from lang_main.loggers import logger_preprocess as logger +from lang_main.pipelines.base import BasePipeline +from lang_main.types import Embedding, PandasIndex # ** (1) dataset preparation: loading and simple preprocessing @@ -45,7 +45,7 @@ def load_raw_data( path : str path to dataset file, usually CSV file date_cols : list[str], optional - columns which contain dates and are parsed as such, + columns which contain dates and are parsed as such, by default ( 'VorgangsDatum', 'ErledigungsDatum', @@ -61,17 +61,22 @@ def load_raw_data( # load dataset date_cols = list(date_cols) data = pd.read_csv( - filepath_or_buffer=path, - sep=';', - encoding='cp1252', - parse_dates=date_cols, + filepath_or_buffer=path, + sep=';', + encoding='cp1252', + parse_dates=date_cols, dayfirst=True, ) - 
logger.info("Loaded dataset successfully.") - logger.info((f"Dataset properties: number of entries: {len(data)}, " - f"number of features {len(data.columns)}")) + logger.info('Loaded dataset successfully.') + logger.info( + ( + f'Dataset properties: number of entries: {len(data)}, ' + f'number of features {len(data.columns)}' + ) + ) return (data,) + def remove_duplicates( data: DataFrame, ) -> tuple[DataFrame]: @@ -89,7 +94,7 @@ def remove_duplicates( """ # obtain info about duplicates over all features duplicates_filt = data.duplicated() - logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}") + logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}') # drop duplicates wo_duplicates = data.drop_duplicates(ignore_index=True) duplicates_subset: list[str] = [ @@ -97,16 +102,26 @@ def remove_duplicates( 'ObjektID', ] duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset) - logger.info(("Number of duplicates over subset " - f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}")) - wo_duplicates =\ - wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy() - logger.info("Removed all duplicates from dataset successfully.") - logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, " - f"number of features {len(wo_duplicates.columns)}")) - + logger.info( + ( + 'Number of duplicates over subset ' + f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}' + ) + ) + wo_duplicates = wo_duplicates.drop_duplicates( + subset=duplicates_subset, ignore_index=True + ).copy() + logger.info('Removed all duplicates from dataset successfully.') + logger.info( + ( + f'New Dataset properties: number of entries: {len(wo_duplicates)}, ' + f'number of features {len(wo_duplicates.columns)}' + ) + ) + return (wo_duplicates,) + def remove_NA( data: DataFrame, target_features: list[str] = [ @@ -127,17 +142,18 @@ def remove_NA( DataFrame dataset with removed NA entries for given subset of features """ - wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore - logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.") - + wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore + logger.info( + f'Removed NA entries for features >>{target_features}<< from dataset successfully.' 
+ ) + return (wo_NA,) + # ** (2) entry-based cleansing # following functions clean and prepare specific entries, not whole dataset -def clean_string_slim( - string: str -) -> str: - """mapping function to clean single string entries in a series (feature-wise) +def clean_string_slim(string: str) -> str: + """mapping function to clean single string entries in a series (feature-wise) of the dataset, used to be applied element-wise for string features Parameters @@ -151,13 +167,16 @@ def clean_string_slim( cleaned entry """ # remove special chars - pattern = r'[\t\n\r\f\v]' + pattern = r'[\t\n\r\f\v]+' string = re.sub(pattern, ' ', string) + pattern = r'([,;.:!?-_\+]){2,}' # remove whitespaces at the beginning and the end + string = re.sub(pattern, r'\1', string) string = string.strip() - + return string + def entry_wise_cleansing( data: DataFrame, target_feature: str, @@ -165,10 +184,16 @@ def entry_wise_cleansing( ) -> tuple[DataFrame]: # apply given cleansing function to target feature data[target_feature] = data[target_feature].map(cleansing_func) - logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< " - f"for feature >>{target_feature}<<")) + logger.info( + ( + f'Successfully applied entry-wise cleansing procedure ' + f'>>{cleansing_func.__name__}<< ' + f'for feature >>{target_feature}<<' + ) + ) return (data,) + # ** in-depth analysis of one feature # following functions try to gain insights on a given feature of the IHM dataset such # as number of occurrences or associated Object IDs @@ -178,15 +203,15 @@ def analyse_feature( ) -> tuple[DataFrame]: # feature columns feature_entries = data[target_feature] - logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}") + logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}') # obtain unique entries unique_feature_entries = feature_entries.unique() - + # prepare result DataFrame cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids'] result_df = pd.DataFrame(columns=cols) - - for entry in tqdm(unique_feature_entries, mininterval=1.): + + for entry in tqdm(unique_feature_entries, mininterval=1.0): len_entry = len(entry) filt = data[target_feature] == entry temp = data[filt] @@ -194,19 +219,16 @@ def analyse_feature( assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable') num_assoc_obj_ids = len(assoc_obj_ids) num_dupl = filt.sum() - - conc_df = pd.DataFrame(data=[[ - entry, - len_entry, - num_dupl, - assoc_obj_ids, - num_assoc_obj_ids - ]], columns=cols) - + + conc_df = pd.DataFrame( + data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]], + columns=cols, + ) + result_df = pd.concat([result_df, conc_df], ignore_index=True) - + result_df = result_df.sort_values(by='num_occur', ascending=False).copy() - + return (result_df,) @@ -223,16 +245,16 @@ def build_embedding_map( embeddings: dict[int, tuple[Embedding, str]] = {} is_spacy = False is_STRF = False - + if isinstance(model, GermanSpacyModel): is_spacy = True elif isinstance(model, SentenceTransformer): is_STRF = True - + if not any((is_spacy, is_STRF)): - raise NotImplementedError("Model type unknown") - - for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.): + raise NotImplementedError('Model type unknown') + + for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0): # verbose code: Pyright not inferring types correctly idx = cast(int, idx) text = cast(str, text) @@ -246,12 +268,17 @@ def build_embedding_map( 
logger.debug(f'{embd.text=} has no vector') elif is_STRF: model = cast(SentenceTransformer, model) - embd = cast(Tensor, - model.encode(text, show_progress_bar=False)) + embd = cast(Tensor, model.encode(text, show_progress_bar=False)) embeddings[idx] = (embd, text) - + return embeddings, (is_spacy, is_STRF) + +# adapt interface +# use candidates by index function +# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix + + # build similarity matrix out of embeddings def build_cosSim_matrix( data: Series, @@ -259,30 +286,31 @@ def build_cosSim_matrix( ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]: # build empty matrix df_index = data.index - cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index, - index=df_index, dtype=np.float32) - - logger.info("Start building embedding map...") - + cosineSim_idx_matrix = pd.DataFrame( + data=0.0, columns=df_index, index=df_index, dtype=np.float32 + ) + + logger.info('Start building embedding map...') + # obtain embeddings based on used model embds, (is_spacy, is_STRF) = build_embedding_map( data=data, model=model, ) - - logger.info("Embedding map built successfully.") - + + logger.info('Embedding map built successfully.') + # apply index based mapping for efficient handling of large texts combs = combinations(df_index, 2) - total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index)-2) - - logger.info("Start calculation of similarity scores...") - - for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.): - #print(f"{idx1=}, {idx2=}") + total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2) + + logger.info('Start calculation of similarity scores...') + + for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0): + # print(f"{idx1=}, {idx2=}") embd1 = embds[idx1][0] embd2 = embds[idx2][0] - + # calculate similarity based on model type if is_spacy: embd1 = cast(SpacyDoc, embds[idx1][0]) @@ -293,14 +321,15 @@ def build_cosSim_matrix( embd2 = cast(Tensor, embds[idx2][0]) cosSim = sentence_transformers.util.cos_sim(embd1, embd2) cosSim = cast(float, cosSim.item()) - + cosineSim_idx_matrix.at[idx1, idx2] = cosSim - - logger.info("Similarity scores calculated successfully.") - + + logger.info('Similarity scores calculated successfully.') + return cosineSim_idx_matrix, embds -# obtain index pairs with cosine similarity + +# obtain index pairs with cosine similarity # greater than or equal to given threshold value def filt_thresh_cosSim_matrix( cosineSim_idx_matrix: DataFrame, @@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix( Series series with multi index (index pairs) and corresponding similarity score """ - cosineSim_filt = cast(Series, - cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()) - + cosineSim_filt = cast( + Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack() + ) + return cosineSim_filt, embds + def list_cosSim_dupl_candidates( cosineSim_filt: Series, embds: dict[int, tuple[Embedding, str]], @@ -335,7 +366,7 @@ def list_cosSim_dupl_candidates( filename: str = 'CosSim-FilterCandidates', pipeline: BasePipeline | None = None, ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: - """providing an overview of candidates with a similarity score greater than + """providing an overview of candidates with a similarity score greater than given threshold; more suitable for debugging purposes Returns @@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates( list containing 
relevant index pairs for entries with similarity score greater than given threshold """ - logger.info("Start gathering of similarity candidates...") + logger.info('Start gathering of similarity candidates...') # compare found duplicates columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score'] df_candidates = pd.DataFrame(columns=columns) - + index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] - for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore + for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore # get text content from embedding as second tuple entry - content = [[ - idx1, - embds[idx1][1], - idx2, - embds[idx2][1], - score, - ]] + content = [ + [ + idx1, + embds[idx1][1], + idx2, + embds[idx2][1], + score, + ] + ] # add candidates to collection DataFrame df_conc = pd.DataFrame(columns=columns, data=content) if df_candidates.empty: @@ -370,25 +403,28 @@ def list_cosSim_dupl_candidates( df_candidates = pd.concat([df_candidates, df_conc]) # save index pairs index_pairs.append((idx1, idx2)) - - logger.info("Similarity candidates gathered successfully.") - + + logger.info('Similarity candidates gathered successfully.') + if save_candidates: if saving_path is None: - raise ValueError(("Saving path must be provided if duplicate " - "candidates should be saved.")) + raise ValueError( + ('Saving path must be provided if duplicate ' 'candidates should be saved.') + ) elif pipeline is not None: - target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' - + filename + '.xlsx') + target_filename = ( + f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx' + ) elif pipeline is None: target_filename = f'{filename}.xlsx' - logger.info("Saving similarity candidates...") + logger.info('Saving similarity candidates...') target_path = saving_path.joinpath(target_filename) df_candidates.to_excel(target_path) - logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.") - + logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.') + return index_pairs, embds + # TODO: change implementation fully to SentenceTransformer # usage of batch processing for embeddings, use candidate idx function # from time analysis --> moved to ``helpers.py`` @@ -419,24 +455,32 @@ def similar_ids_groups( yield list(id_group) """ + def merge_similarity_dupl( data: DataFrame, - similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]], + model: SentenceTransformer, + cos_sim_threshold: float, ) -> tuple[DataFrame]: - logger.info("Start merging of similarity candidates...") - + logger.info('Start merging of similarity candidates...') + # data merged_data = data.copy() + model_input = merged_data['entry'] + candidates_idx = candidates_by_index( + data_model_input=model_input, + model=model, + cos_sim_threshold=cos_sim_threshold, + ) # graph of similar ids - similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs) - + similar_id_graph, _ = similar_index_connection_graph(candidates_idx) + for similar_id_group in similar_index_groups(similar_id_graph): similar_id_group = list(similar_id_group) - similar_data = merged_data.loc[similar_id_group,:] - # keep first entry with max number occurrences, then number of + similar_data = merged_data.loc[similar_id_group, :] + # keep first entry with max number occurrences, then number of # associated objects, then length of entry similar_data = similar_data.sort_values( - by=['num_occur', 
'num_assoc_obj_ids', 'len'], + by=['num_occur', 'num_assoc_obj_ids', 'len'], ascending=[False, False, False], ) # merge information to first entry @@ -453,11 +497,12 @@ def merge_similarity_dupl( # update entry in main dataset, drop remaining entries merged_data.update(merged_similar_data) merged_data = merged_data.drop(index=similar_id_group) - - logger.info("Similarity candidates merged successfully.") - + + logger.info('Similarity candidates merged successfully.') + return (merged_data.copy(),) + # merge duplicates def merge_similarity_dupl_old( data: DataFrame, @@ -466,15 +511,14 @@ def merge_similarity_dupl_old( # copy pre-cleaned data temp = data.copy() index = temp.index - #logger.info("Start merging of similarity candidates...") - + # logger.info("Start merging of similarity candidates...") + # iterate over index pairs - for (i1, i2) in tqdm(dupl_idx_pairs): - + for i1, i2 in tqdm(dupl_idx_pairs): # if an entry does not exist any more, skip this pair if i1 not in index or i2 not in index: continue - + # merge num occur num_occur1 = temp.at[i1, 'num_occur'] num_occur2 = temp.at[i2, 'num_occur'] @@ -493,13 +537,13 @@ def merge_similarity_dupl_old( temp.at[i1, 'num_occur'] = new_num_occur temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids - + # drop second entry temp = temp.drop(index=i2) index = temp.index - - #logger.info("Similarity candidates merged successfully.") - + + # logger.info("Similarity candidates merged successfully.") + return (temp,) @@ -508,7 +552,7 @@ def choose_cosSim_dupl_candidates( cosineSim_filt: Series, embds: dict[int, tuple[Embedding, str]], ) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]: - """providing an overview of candidates with a similarity score greater than + """providing an overview of candidates with a similarity score greater than given threshold, but decision is made manually by iterating through the candidates with user interaction; more suitable for debugging purposes @@ -520,15 +564,14 @@ def choose_cosSim_dupl_candidates( list containing relevant index pairs for entries with similarity score greater than given threshold """ - - + # compare found duplicates columns = ['idx1', 'text1', 'idx2', 'text2', 'score'] df_candidates = pd.DataFrame(columns=columns) - + index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] - for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore + for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore # get texts for comparison text1 = embds[idx1][1] text2 = embds[idx2][1] @@ -537,21 +580,23 @@ def choose_cosSim_dupl_candidates( print('text1:\n', text1, '\n', flush=True) print('text2:\n', text2, '\n', flush=True) decision = input('Please enter >>y<< if this is a duplicate, else hit enter:') - + if not decision == 'y': continue - + # get text content from embedding as second tuple entry - content = [[ - idx1, - text1, - idx2, - text2, - score, - ]] + content = [ + [ + idx1, + text1, + idx2, + text2, + score, + ] + ] df_conc = pd.DataFrame(columns=columns, data=content) - + df_candidates = pd.concat([df_candidates, df_conc]) index_pairs.append((idx1, idx2)) - - return df_candidates, index_pairs \ No newline at end of file + + return df_candidates, index_pairs diff --git a/src/lang_main/analysis/shared.py b/src/lang_main/analysis/shared.py index 9165e96..277675b 100644 --- a/src/lang_main/analysis/shared.py +++ b/src/lang_main/analysis/shared.py @@ -1,11 +1,71 @@ -from typing import cast from collections.abc import Iterable, Iterator 
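For orientation: the rewritten merge_similarity_dupl above and get_timeline_candidates_index in timeline.py below now share the same three-step pattern that lives in analysis/shared.py — batch-encode the texts with a SentenceTransformer, turn the thresholded index pairs into an undirected graph, and read connected components off as duplicate groups. A rough usage sketch; the model checkpoint name is illustrative and not taken from the config:

from pandas import Series
from sentence_transformers import SentenceTransformer

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
texts = Series(['Pumpe defekt', 'Pumpe kaputt', 'Fenster klemmt'], index=[10, 11, 12])

pairs = candidates_by_index(data_model_input=texts, model=model, cos_sim_threshold=0.8)
graph, _ = similar_index_connection_graph(similar_idx_pairs=pairs)
for group in similar_index_groups(graph):
    print(group)  # e.g. (10, 11) if the first two entries clear the threshold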
+from typing import cast import networkx as nx +import numpy as np +import numpy.typing as npt +import sentence_transformers +import sentence_transformers.util from networkx import Graph +from pandas import Series +from sentence_transformers import SentenceTransformer +from torch import Tensor +from tqdm.auto import tqdm +from lang_main.analysis.graphs import get_graph_metadata, update_graph from lang_main.types import PandasIndex -from lang_main.analysis.graphs import update_graph, get_graph_metadata + + +def candidates_by_index( + data_model_input: Series, + model: SentenceTransformer, + cos_sim_threshold: float = 0.5, + # ) -> Iterator[tuple[PandasIndex, PandasIndex]]: +) -> Iterator[tuple[PandasIndex, PandasIndex]]: + """function to filter candidate indices based on cosine similarity + using SentenceTransformer model in batch mode, + feed data as Series to retain information about indices of entries and + access them later in the original dataset + + Parameters + ---------- + obj_id : ObjectID + _description_ + data_model_input : Series + containing indices and text entries to process + model : SentenceTransformer + necessary SentenceTransformer model to encode text entries + cos_sim_threshold : float, optional + threshold for cosine similarity to filter candidates, by default 0.5 + + Yields + ------ + Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]] + ObjectID and tuple of index pairs which meet the cosine + similarity threshold + """ + # embeddings + batch = cast(list[str], data_model_input.to_list()) + embds = cast( + Tensor, + model.encode( + batch, + convert_to_numpy=False, + convert_to_tensor=True, + show_progress_bar=False, + ), + ) + # cosine similarity + cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy()) + np.fill_diagonal(cos_sim, 0.0) + cos_sim = np.triu(cos_sim) + cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold) + + for idx_array in cos_sim_idx: + idx_pair = cast( + tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array) + ) + yield idx_pair def similar_index_connection_graph( @@ -15,21 +75,21 @@ def similar_index_connection_graph( # use this graph to get connected components (indices which belong together) # retain semantic connection on whole dataset similar_id_graph = nx.Graph() - for (idx1, idx2) in similar_idx_pairs: - # inplace operation, parent/child do not really exist in undirected graph - update_graph(graph=similar_id_graph, parent=idx1, child=idx2) - + # for idx1, idx2 in similar_idx_pairs: + # # inplace operation, parent/child do not really exist in undirected graph + # update_graph(graph=similar_id_graph, parent=idx1, child=idx2) + update_graph(graph=similar_id_graph, batch=similar_idx_pairs) + graph_info = get_graph_metadata(graph=similar_id_graph, logging=False) - + return similar_id_graph, graph_info -# TODO check returning tuple + def similar_index_groups( similar_id_graph: Graph, ) -> Iterator[tuple[PandasIndex, ...]]: # groups of connected indices - ids_groups = cast(Iterator[set[PandasIndex]], - nx.connected_components(G=similar_id_graph)) - + ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph)) + for id_group in ids_groups: - yield tuple(id_group) \ No newline at end of file + yield tuple(id_group) diff --git a/src/lang_main/analysis/timeline.py b/src/lang_main/analysis/timeline.py index 3f67bb4..9d90c7c 100644 --- a/src/lang_main/analysis/timeline.py +++ b/src/lang_main/analysis/timeline.py @@ -1,21 +1,17 @@ -from typing import cast from 
collections.abc import Iterable, Iterator +from typing import cast -import numpy as np -import numpy.typing as npt from pandas import DataFrame, Series -from torch import Tensor from sentence_transformers import SentenceTransformer -import sentence_transformers -import sentence_transformers.util -from tqdm.auto import tqdm # TODO: check deletion +from tqdm.auto import tqdm # TODO: check deletion -from lang_main.types import PandasIndex, ObjectID, TimelineCandidates -from lang_main.loggers import logger_timeline as logger from lang_main.analysis.shared import ( + candidates_by_index, similar_index_connection_graph, similar_index_groups, ) +from lang_main.loggers import logger_timeline as logger +from lang_main.types import ObjectID, PandasIndex, TimelineCandidates def non_relevant_obj_ids( @@ -25,35 +21,36 @@ def non_relevant_obj_ids( feature_uniqueness: str = 'HObjektText', feature_obj_id: str = 'ObjektID', ) -> tuple[ObjectID, ...]: - data = data.copy() ids_to_ignore: set[ObjectID] = set() - obj_ids = cast(Iterable[ObjectID], # actually NumPy array - data[feature_obj_id].unique()) + obj_ids = cast( + Iterable[ObjectID], # actually NumPy array + data[feature_obj_id].unique(), + ) for obj_id in obj_ids: feats_per_obj_id = cast( - Series, - data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness] + Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness] ) # check for uniqueness of given feature for current ObjectID # ignore NaN values feats_per_obj_id = feats_per_obj_id.dropna() unique_feats_per_obj_id = len(feats_per_obj_id.unique()) - + if unique_feats_per_obj_id > thresh_unique_feat_per_id: ids_to_ignore.add(obj_id) - + return tuple(ids_to_ignore) + def remove_non_relevant_obj_ids( data: DataFrame, thresh_unique_feat_per_id: int, *, feature_uniqueness: str = 'HObjektText', feature_obj_id: str = 'ObjektID', -) -> DataFrame: - logger.info("Removing non-relevant ObjectIDs from dataset") +) -> tuple[DataFrame]: + logger.info('Removing non-relevant ObjectIDs from dataset') data = data.copy() ids_to_ignore = non_relevant_obj_ids( data=data, @@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids( ) # only retain entries with ObjectIDs not in IDs to ignore data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))] - logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}") - logger.info("Non-relevant ObjectIDs removed successfully") - - return data + logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}') + logger.info('Non-relevant ObjectIDs removed successfully') + + return (data,) -def filter_activities_per_obj_id( - data: DataFrame, - activity_feature: str = 'VorgangsTypName', - relevant_activity_types: Iterable[str] = ( - 'Reparaturauftrag (Portal)', - ), - feature_obj_id: str = 'ObjektID', - threshold_num_activities: int = 1, -) -> tuple[DataFrame, Series]: - data = data.copy() - # filter only relevant activities count occurrences for each ObjectID - logger.info("Filtering activities per ObjectID") - filt_rel_activities = data[activity_feature].isin(relevant_activity_types) - data_filter_activities = data.loc[filt_rel_activities].copy() - num_activities_per_obj_id = cast( - Series, - data_filter_activities[feature_obj_id].value_counts(sort=True) - ) - # filter for ObjectIDs with more than given number of activities - filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities) - # index of series contains ObjectIDs - obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index - filt_entries_below_thresh = (data_filter_activities[feature_obj_id] - 
.isin(obj_ids_below_thresh)) - - num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh] - data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh] - logger.info("Activities per ObjectID filtered successfully") - - return data_filter_activities, num_activities_per_obj_id def generate_model_input( data: DataFrame, @@ -107,8 +74,8 @@ def generate_model_input( 'VorgangsArtText', 'VorgangsBeschreibung', ), -) -> DataFrame: - logger.info("Generating concatenation of model input features") +) -> tuple[DataFrame]: + logger.info('Generating concatenation of model input features') data = data.copy() model_input_features = list(model_input_features) input_features = data[model_input_features].fillna('').astype(str) @@ -116,9 +83,40 @@ def generate_model_input( lambda x: ' - '.join(x), axis=1, ) - logger.info("Model input generated successfully") - - return data + logger.info('Model input generated successfully') + + return (data,) + + +def filter_activities_per_obj_id( + data: DataFrame, + activity_feature: str = 'VorgangsTypName', + relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',), + feature_obj_id: str = 'ObjektID', + threshold_num_activities: int = 1, +) -> tuple[DataFrame, Series]: + data = data.copy() + # filter only relevant activities count occurrences for each ObjectID + logger.info('Filtering activities per ObjectID') + filt_rel_activities = data[activity_feature].isin(relevant_activity_types) + data_filter_activities = data.loc[filt_rel_activities].copy() + num_activities_per_obj_id = cast( + Series, data_filter_activities[feature_obj_id].value_counts(sort=True) + ) + # filter for ObjectIDs with more than given number of activities + filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities + # index of series contains ObjectIDs + obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index + filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin( + obj_ids_below_thresh + ) + + num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh] + data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh] + logger.info('Activities per ObjectID filtered successfully') + + return data_filter_activities, num_activities_per_obj_id + # for each obj_id in relevant_obj_ids ## filter data for obj_id @@ -130,6 +128,7 @@ def generate_model_input( ## obtain idx pairs, yield ## use idx pairs to get idx values of series + def get_timeline_candidates_index( data: DataFrame, num_activities_per_obj_id: Series, @@ -140,92 +139,33 @@ def get_timeline_candidates_index( model_input_feature: str = 'nlp_model_input', ) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]: # already sorted ObjIDs (descending regarding number of activities) - obj_ids = cast(Iterable[ObjectID], - num_activities_per_obj_id.index) - + obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index) + for obj_id in tqdm(obj_ids): - data_per_obj_id = cast( - DataFrame, - data.loc[data[feature_obj_id]==obj_id] - ) + data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id]) data_model_input = data_per_obj_id[model_input_feature] - + candidates_idx = candidates_by_index( data_model_input=data_model_input, model=model, cos_sim_threshold=cos_sim_threshold, ) # directly process candidates - candidates_idx = tuple(candidates_idx) + # candidates_idx = tuple(candidates_idx) similar_id_graph, _ = similar_index_connection_graph( similar_idx_pairs=candidates_idx, ) - + for 
index_group in similar_index_groups(similar_id_graph): yield obj_id, index_group + # TODO: check application for duplicate removal -def candidates_by_index( - data_model_input: Series, - model: SentenceTransformer, - cos_sim_threshold: float = 0.5, -) -> Iterator[tuple[PandasIndex, PandasIndex]]: - """function to filter candidate indices based on cosine similarity - using SentenceTransformer model in batch mode, - feed data as Series to retain information about indices of entries and - access them later in the original dataset - - Parameters - ---------- - obj_id : ObjectID - _description_ - data_model_input : Series - containing indices and text entries to process - model : SentenceTransformer - necessary SentenceTransformer model to encode text entries - cos_sim_threshold : float, optional - threshold for cosine similarity to filter candidates, by default 0.5 - - Yields - ------ - Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]] - ObjectID and tuple of index pairs which meet the cosine - similarity threshold - """ - # embeddings - batch = cast(list[str], - data_model_input.to_list()) - embds = cast( - Tensor, - model.encode( - batch, - convert_to_numpy=False, - convert_to_tensor=True, - show_progress_bar=False, - ) - ) - # cosine similarity - cos_sim = cast( - npt.NDArray, - sentence_transformers.util.cos_sim(embds, embds).numpy() - ) - np.fill_diagonal(cos_sim, 0.) - cos_sim = np.triu(cos_sim) - cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold) - - for idx_array in cos_sim_idx: - idx_pair = cast( - tuple[np.int64, np.int64], - tuple(data_model_input.index[idx] for idx in idx_array) - ) - yield idx_pair - - def transform_timeline_candidates( candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]], ) -> TimelineCandidates: """function to build a mapping of ObjectIDs to their respective collection of - timeline candidates (as tuple), each candidate group is separated as distinct + timeline candidates (as tuple), each candidate group is separated as distinct tuple within this outer tuple Parameters @@ -238,12 +178,12 @@ def transform_timeline_candidates( dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]] dictionary: ObjectID -> tuple of candidate groups """ - + candidates_by_obj_id: TimelineCandidates = {} - + obj_id_target: ObjectID | None = None collection: list[tuple[PandasIndex, ...]] = [] - + for obj_id, cands in candidates: if obj_id_target is None: collection = [] @@ -253,26 +193,58 @@ def transform_timeline_candidates( collection = [] obj_id_target = obj_id collection.append(cands) - + if collection and obj_id_target is not None: candidates_by_obj_id[obj_id_target] = tuple(collection) - + return candidates_by_obj_id -def map_obj_texts( + +def map_obj_id_to_texts( data: DataFrame, - obj_ids: Iterable[ObjectID], + feature_obj_id: str = 'ObjektID', ) -> dict[ObjectID, str]: + data = data.copy() + obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique()) + obj_id_to_text: dict[ObjectID, str] = {} - - for obj_id in obj_ids: - data_per_obj = cast( - DataFrame, - data.loc[data['ObjektID']==obj_id] - ) + + for obj_id in tqdm(obj_ids): + data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id]) # just take first entry obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0]) obj_text = obj_text.strip(r' ,.:') obj_id_to_text[obj_id] = obj_text - - return obj_id_to_text \ No newline at end of file + + return obj_id_to_text + + +def get_timeline_candidates( + data: DataFrame, + num_activities_per_obj_id: Series, + *, + model: 
SentenceTransformer, + cos_sim_threshold: float, + feature_obj_id: str = 'ObjektID', + model_input_feature: str = 'nlp_model_input', +) -> tuple[TimelineCandidates, dict[ObjectID, str]]: + logger.info('Obtaining timeline candidates...') + candidates = get_timeline_candidates_index( + data=data, + num_activities_per_obj_id=num_activities_per_obj_id, + model=model, + cos_sim_threshold=cos_sim_threshold, + feature_obj_id=feature_obj_id, + model_input_feature=model_input_feature, + ) + tl_candidates = transform_timeline_candidates(candidates) + logger.info('Timeline candidates obtained successfully.') + # text mapping to obtain object descriptors + logger.info('Mapping ObjectIDs to their respective text descriptor...') + map_obj_text = map_obj_id_to_texts( + data=data, + feature_obj_id=feature_obj_id, + ) + logger.info('ObjectIDs successfully mapped to text descriptors.') + + return tl_candidates, map_obj_text diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py index 02c05e9..cf4efb2 100644 --- a/src/lang_main/analysis/tokens.py +++ b/src/lang_main/analysis/tokens.py @@ -1,56 +1,56 @@ -from typing import cast import re -from itertools import combinations from collections.abc import Iterator +from itertools import combinations +from typing import cast from dateutil.parser import parse -from spacy.tokens.token import Token as SpacyToken -from spacy.tokens.doc import Doc as SpacyDoc -from spacy.lang.de import German as GermanSpacyModel from pandas import DataFrame +from spacy.lang.de import German as GermanSpacyModel +from spacy.tokens.doc import Doc as SpacyDoc +from spacy.tokens.token import Token as SpacyToken from tqdm.auto import tqdm -from lang_main.loggers import logger_token_analysis as logger from lang_main.analysis.graphs import ( - update_graph, TokenGraph, + update_graph, ) - +from lang_main.loggers import logger_token_analysis as logger # ** Logging -#LOGGING_LEVEL = 'INFO' -#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout) -#logger = logging.getLogger('ihm_analyse.token_analysis') +# LOGGING_LEVEL = 'INFO' +# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout) +# logger = logging.getLogger('ihm_analyse.token_analysis') # ** POS -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX']) -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX']) -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN']) POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX']) -#POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB']) +# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB']) POS_INDIRECT: frozenset[str] = frozenset(['AUX']) # ** TAG -#TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD']) +# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD']) TAG_OF_INTEREST: frozenset[str] = frozenset() # ** obtaining connection in texts + def pre_clean_word(string: str) -> str: - pattern = r'[^A-Za-zäöüÄÖÜ]+' string = re.sub(pattern, '', string) - + return string -# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format + +# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format def is_str_date( - string: str, + string: str, fuzzy: bool = False, ) -> bool: - #print(string) + # print(string) try: 
# check if string is a number # if length is greater than 8, it is not a date @@ -60,33 +60,38 @@ def is_str_date( except ValueError: # not a number pass - + try: parse(string, fuzzy=fuzzy) return True except ValueError: return False + def obtain_relevant_descendants( - token: SpacyToken, + token: SpacyToken, ) -> Iterator[SpacyToken]: - for descendant in token.subtree: # subtrees contain the token itself # if current element is token skip this element if descendant == token: continue - + # if descendant is a date skip it) if is_str_date(string=descendant.text): continue - - logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant " - f">>{descendant}<<, POS >>{descendant.pos_}<<")) - + + logger.debug( + ( + f'Token >>{token}<<, POS >>{token.pos_}<< | descendant ' + f'>>{descendant}<<, POS >>{descendant.pos_}<<' + ) + ) + # eliminate cases of cross-references with verbs - if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and - (descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')): + if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and ( + descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB' + ): continue # skip cases in which descendant is indirect POS with others than verbs elif descendant.pos_ in POS_INDIRECT: @@ -94,11 +99,12 @@ def obtain_relevant_descendants( # skip cases in which child has no relevant POS or TAG elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST): continue - + yield descendant - + # TODO look at results and fine-tune function accordingly + def add_doc_info_to_graph( graph: TokenGraph, doc: SpacyDoc, @@ -114,7 +120,7 @@ def add_doc_info_to_graph( # skip token which are dates or times if is_str_date(string=token.text): continue - + relevant_descendants = obtain_relevant_descendants(token=token) # for non-AUX: add parent <--> descendant pair to graph if token.pos_ not in POS_INDIRECT: @@ -124,13 +130,13 @@ def add_doc_info_to_graph( graph=graph, parent=token.lemma_, child=descendant.lemma_, - weight_connection=weight + weight_connection=weight, ) else: # if indirect POS, make connection between all associated words combs = combinations(relevant_descendants, r=2) for comb in combs: - # !! parents and children do not really exist in this case, + # !! parents and children do not really exist in this case, # !! 
but only one connection is made update_graph( graph=graph, @@ -139,32 +145,33 @@ def add_doc_info_to_graph( weight_connection=weight, ) + def build_token_graph( data: DataFrame, model: GermanSpacyModel, ) -> tuple[TokenGraph]: # empty NetworkX directed graph - #graph = nx.DiGraph() + # graph = nx.DiGraph() graph = TokenGraph() - + for row in tqdm(data.itertuples(), total=len(data)): # obtain properties from tuple # attribute names must match with preprocessed data entry_text = cast(str, row.entry) weight = cast(int, row.num_occur) - + # get spacy model output doc = model(entry_text) - + add_doc_info_to_graph( graph=graph, doc=doc, weight=weight, ) - + # metadata graph.update_metadata() # convert to undirected graph.to_undirected() - - return (graph,) \ No newline at end of file + + return (graph,) diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py new file mode 100644 index 0000000..c60439f --- /dev/null +++ b/src/lang_main/constants.py @@ -0,0 +1,55 @@ +from pathlib import Path +from typing import Final + +from lang_main import CONFIG + +# ** paths +INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs']) +SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) +PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) +# ** control +DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] +SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip'] +DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] +SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip'] +DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] +SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip'] +DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis'] +SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip'] +# ** export + +# ** preprocessing +FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][ + 'filename_cossim_filter_candidates' +] +DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols'] +THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][ + 'threshold_amount_characters' +] +THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'] +# ** token analysis + +# ** graph postprocessing +THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight'] +# ** time analysis.uniqueness +THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][ + 'threshold_unique_texts' +] +UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][ + 'criterion_feature' +] +FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id'] +# ** time_analysis.model_input +MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple( + CONFIG['time_analysis']['model_input']['input_features'] +) +ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature'] +ACTIVITY_TYPES: Final[tuple[str]] = tuple( + CONFIG['time_analysis']['model_input']['activity_types'] +) +THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][ + 'threshold_num_acitivities' +] +THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][ + 'threshold_similarity' +] diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml new file mode 100644 index 0000000..c694e25 --- /dev/null +++ b/src/lang_main/lang_main_config.toml @@ -0,0 +1,56 @@ +# 
lang_main: Config file + +[paths] +inputs = './inputs/' +results = './results/test_new2/' +dataset = './01_2_Rohdaten_neu/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = false +graph_postprocessing = false +graph_postprocessing_skip = false +time_analysis = false +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + +[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +input_features = [ + 'VorgangsTypName', + 'VorgangsArtText', + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py index f33302e..eadbb4d 100644 --- a/src/lang_main/loggers.py +++ b/src/lang_main/loggers.py @@ -1,5 +1,5 @@ -from typing import Final import logging +from typing import Final from lang_main.types import LoggingLevels diff --git a/src/lang_main/pipelines/base.py b/src/lang_main/pipelines/base.py index 8273c4b..ad78589 100644 --- a/src/lang_main/pipelines/base.py +++ b/src/lang_main/pipelines/base.py @@ -1,20 +1,18 @@ -from typing import Any -#from types import FunctionType -import sys -import logging from collections.abc import Callable from pathlib import Path +from typing import Any from lang_main.loggers import logger_pipelines as logger -from lang_main.shared import save_pickle, load_pickle +from lang_main.shared import load_pickle, save_pickle # ** pipelines to perform given actions on dataset in a customisable manner + class NoPerformableActionError(Exception): """Error describing that no action is available in the current pipeline""" -class BasePipeline(): - + +class BasePipeline: def __init__( self, name: str, @@ -22,12 +20,14 @@ class BasePipeline(): ) -> None: # init base class super().__init__() - + # name of pipeline self.name = name # working directory for pipeline == output path self.working_dir = working_dir - + # if not self.working_dir.exists(): + # self.working_dir.mkdir(parents=True) + # container for actions to perform during pass self.actions: list[Callable] = [] self.action_names: list[str] = [] @@ -37,15 +37,17 @@ class BasePipeline(): self.curr_proc_idx: int = 1 # intermediate result self._intermediate_result: Any | None = None - + def __repr__(self) -> str: - return (f"{self.__class__.__name__}(name: {self.name}, " - f"working dir: {self.working_dir}, contents: {self.action_names})") - + return ( + f'{self.__class__.__name__}(name: {self.name}, ' + f'working dir: {self.working_dir}, contents: {self.action_names})' + ) + @property def intermediate_result(self) -> Any: return self._intermediate_result - + def add( self, action: Callable, @@ -53,16 +55,17 @@ class BasePipeline(): save_result: bool = False, ) -> None: # check explicitly for 
function type - #if isinstance(action, FunctionType): + # if isinstance(action, FunctionType): if isinstance(action, Callable): self.actions.append(action) self.action_names.append(action.__name__) self.actions_kwargs.append(action_kwargs.copy()) self.is_save_result.append(save_result) else: - raise TypeError(("Action must be custom function, " - f"but is of type >>{type(action)}<<.")) - + raise TypeError( + f'Action must be custom function, but is of type >>{type(action)}<<.' + ) + # TODO: add multiple entries by utilising simple add method """ def add_multi( @@ -84,7 +87,7 @@ class BasePipeline(): raise TypeError(("Action must be function or sequence of functions, " f"but is of type >>{type(action)}<<.")) """ - + def save_curr_result( self, filename: str, @@ -94,7 +97,7 @@ class BasePipeline(): target_path = target_path.with_suffix('.pkl') # saving file locally save_pickle(obj=self._intermediate_result, path=target_path) - + def load_intermediate_result( self, saving_path: str, @@ -103,25 +106,26 @@ class BasePipeline(): target_path = Path(saving_path + filename).with_suffix('.pkl') # loading DataFrame or Series from pickle data = load_pickle(target_path) - + return data - + def prep_run(self) -> None: - logger.info(f"Starting processing pipeline >>{self.name}<<...") + logger.info(f'Starting processing pipeline >>{self.name}<<...') # progress tracking self.curr_proc_idx = 1 # check if performable actions available if len(self.actions) == 0: - raise NoPerformableActionError(("The pipeline does not contain any " - "performable actions.")) - + raise NoPerformableActionError( + ('The pipeline does not contain any ' 'performable actions.') + ) + def run( self, starting_values: tuple[Any, ...], ) -> tuple[Any, ...]: # prepare start self.prep_run() - + for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)): if idx == 0: ret = action(*starting_values, **action_kwargs) @@ -134,7 +138,7 @@ class BasePipeline(): self.save_curr_result(filename=self.action_names[idx]) # processing tracking self.curr_proc_idx += 1 - - logger.info(f"Processing pipeline >>{self.name}<< successfully ended.") - - return ret \ No newline at end of file + + logger.info(f'Processing pipeline >>{self.name}<< successfully ended.') + + return ret diff --git a/src/lang_main/pipelines/predefined.py b/src/lang_main/pipelines/predefined.py index e440646..ea168dc 100644 --- a/src/lang_main/pipelines/predefined.py +++ b/src/lang_main/pipelines/predefined.py @@ -1,57 +1,144 @@ -from sentence_transformers import SentenceTransformer import spacy +from sentence_transformers import SentenceTransformer -from lang_main import ( - SAVE_PATH_FOLDER, - DATE_COLS, - FILENAME_COSSIM_FILTER_CANDIDATES, - THRESHOLD_SIMILARITY, -) -from lang_main.pipelines.base import BasePipeline from lang_main.analysis.preprocessing import ( - load_raw_data, - remove_duplicates, - remove_NA, + analyse_feature, clean_string_slim, entry_wise_cleansing, - analyse_feature, - build_cosSim_matrix, - filt_thresh_cosSim_matrix, - list_cosSim_dupl_candidates, + load_raw_data, merge_similarity_dupl, + remove_duplicates, + remove_NA, +) +from lang_main.analysis.timeline import ( + filter_activities_per_obj_id, + generate_model_input, + get_timeline_candidates, + remove_non_relevant_obj_ids, ) from lang_main.analysis.tokens import build_token_graph +from lang_main.constants import ( + ACTIVITY_FEATURE, + ACTIVITY_TYPES, + DATE_COLS, + FEATURE_NAME_OBJ_ID, + MODEL_INPUT_FEATURES, + SAVE_PATH_FOLDER, + THRESHOLD_NUM_ACTIVITIES, + 
THRESHOLD_SIMILARITY, + THRESHOLD_TIMELINE_SIMILARITY, + THRESHOLD_UNIQUE_TEXTS, + UNIQUE_CRITERION_FEATURE, +) +from lang_main.pipelines.base import BasePipeline # ** pipeline configuration # ** target feature preparation pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER) -pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS}) +pipe_target_feat.add( + load_raw_data, + { + 'date_cols': DATE_COLS, + }, +) pipe_target_feat.add(remove_duplicates) pipe_target_feat.add(remove_NA, save_result=True) -pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim}) -pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True) +pipe_target_feat.add( + entry_wise_cleansing, + { + 'target_feature': 'VorgangsBeschreibung', + 'cleansing_func': clean_string_slim, + }, +) +pipe_target_feat.add( + analyse_feature, + { + 'target_feature': 'VorgangsBeschreibung', + }, + save_result=True, +) # output: DataFrame containing target feature with # number of occurrences and associated ObjectIDs # ** embedding pipe +# ?? still needed? # using similarity between entries to catch duplicates with typo or similar content -pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) +# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) model_spacy = spacy.load('de_dep_news_trf') model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') -pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) -pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True) -pipe_embds.add( - list_cosSim_dupl_candidates, - {'save_candidates': True, - 'saving_path': SAVE_PATH_FOLDER, - 'filename': FILENAME_COSSIM_FILTER_CANDIDATES, - 'pipeline': pipe_embds}, save_result=True) +# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) +# pipe_embds.add( +# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True +# ) +# pipe_embds.add( +# list_cosSim_dupl_candidates, +# { +# 'save_candidates': True, +# 'saving_path': SAVE_PATH_FOLDER, +# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES, +# 'pipeline': pipe_embds, +# }, +# save_result=True, +# ) # ** Merge duplicates pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER) -pipe_merge.add(merge_similarity_dupl, save_result=True) +# pipe_merge.add(merge_similarity_dupl, save_result=True) +pipe_merge.add( + merge_similarity_dupl, + { + 'model': model_stfr, + 'cos_sim_threshold': THRESHOLD_SIMILARITY, + }, + save_result=True, +) # ** token analysis pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER) -pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True) +pipe_token_analysis.add( + build_token_graph, + { + 'model': model_spacy, + }, + save_result=True, +) + + +# ** timeline analysis +pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER) +pipe_timeline.add( + remove_non_relevant_obj_ids, + { + 'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS, + 'feature_uniqueness': UNIQUE_CRITERION_FEATURE, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + }, + save_result=True, +) +pipe_timeline.add( + generate_model_input, + { + 'target_feature_name': 'nlp_model_input', + 'model_input_features': MODEL_INPUT_FEATURES, + }, +) +pipe_timeline.add( + filter_activities_per_obj_id, + { + 'activity_feature': 
ACTIVITY_FEATURE, + 'relevant_activity_types': ACTIVITY_TYPES, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + 'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES, + }, +) +pipe_timeline.add( + get_timeline_candidates, + { + 'model': model_stfr, + 'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + 'model_input_feature': 'nlp_model_input', + }, + save_result=True, +) diff --git a/src/lang_main/shared.py b/src/lang_main/shared.py index e54286b..e44139f 100644 --- a/src/lang_main/shared.py +++ b/src/lang_main/shared.py @@ -1,56 +1,67 @@ -from typing import Any import os -import shutil import pickle +import shutil import tomllib from pathlib import Path +from typing import Any from lang_main.loggers import logger_shared_helpers as logger + # ** Lib def create_saving_folder( saving_path_folder: str | Path, overwrite_existing: bool = False, ) -> None: # check for existence of given path - if not os.path.exists(saving_path_folder): - os.makedirs(saving_path_folder) + if isinstance(saving_path_folder, str): + saving_path_folder = Path(saving_path_folder) + if not saving_path_folder.exists(): + saving_path_folder.mkdir(parents=True) else: if overwrite_existing: # overwrite if desired (deletes whole path and re-creates it) shutil.rmtree(saving_path_folder) os.makedirs(saving_path_folder) else: - logger.info((f"Path >>{saving_path_folder}<< already exists and remained " - "unchanged. If you want to overwrite this path, use parameter " - ">>overwrite_existing<<.")) + logger.info( + ( + f'Path >>{saving_path_folder}<< already exists and remained ' + f'unchanged. If you want to overwrite this path, use parameter ' + f'>>overwrite_existing<<.' + ) + ) + def load_toml_config( path_to_toml: str | Path, ) -> dict[str, Any]: - with open(path_to_toml, "rb") as f: + with open(path_to_toml, 'rb') as f: data = tomllib.load(f) - logger.info("Loaded TOML config file successfully.") + logger.info('Loaded TOML config file successfully.') return data + # saving and loading using pickle # careful: pickling from unknown sources can be dangerous def save_pickle( - obj: Any, + obj: Any, path: str | Path, ) -> None: with open(path, 'wb') as file: pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) - logger.info(f"Saved file successfully under {path}") + logger.info(f'Saved file successfully under {path}') + def load_pickle( path: str | Path, ) -> Any: with open(path, 'rb') as file: obj = pickle.load(file) - logger.info("Loaded file successfully.") + logger.info('Loaded file successfully.') return obj + # TODO: remove, too specialised for common application """ def filter_candidates_idx( @@ -103,4 +114,4 @@ def filter_candidates_idx( tuple(data_model_input.index[idx] for idx in idx_array) ) yield idx_pair -""" \ No newline at end of file +""" diff --git a/src/lang_main/types.py b/src/lang_main/types.py index 85d032a..a635987 100644 --- a/src/lang_main/types.py +++ b/src/lang_main/types.py @@ -1,4 +1,4 @@ -from typing import TypeAlias, Literal +from typing import Literal, TypeAlias import numpy as np from spacy.tokens.doc import Doc as SpacyDoc @@ -6,7 +6,7 @@ from torch import Tensor LoggingLevels: TypeAlias = Literal[ 'DEBUG', - 'INFO', + 'INFO', 'WARNING', 'ERROR', 'CRITICAL', @@ -16,4 +16,4 @@ PandasIndex: TypeAlias = int | np.int64 ObjectID: TypeAlias = int Embedding: TypeAlias = SpacyDoc | Tensor -TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]] \ No newline at end of file +TimelineCandidates: TypeAlias = dict[ObjectID, 
tuple[tuple[PandasIndex, ...], ...]] diff --git a/test-notebooks/Preprocess_Pipeline.ipynb b/test-notebooks/Preprocess_Pipeline.ipynb index 6525b52..c6c6626 100644 --- a/test-notebooks/Preprocess_Pipeline.ipynb +++ b/test-notebooks/Preprocess_Pipeline.ipynb @@ -13,29 +13,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'ihm_analyse'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'" ] } ], "source": [ - "from ihm_analyse import CONFIG\n", - "from ihm_analyse.lib.preprocess import (\n", + "from lang_main import CONFIG\n", + "from lang_main.lib.preprocess import (\n", " load_raw_data,\n", " remove_duplicates,\n", " remove_NA,\n", @@ -47,8 +43,8 @@ " list_cosSim_dupl_candidates,\n", " merge_similarity_dupl,\n", ")\n", - "from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", - "from ihm_analyse.lib.helpers import (\n", + "from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n", + "from lang_main.lib.helpers import (\n", " save_pickle, \n", " load_pickle, \n", " create_saving_folder,\n", diff --git a/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl b/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl differ diff --git a/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl b/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl new file mode 100644 index 0000000..5565194 Binary files /dev/null and 
b/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl differ diff --git a/test-notebooks/dashboard/app.py b/test-notebooks/dashboard/app.py index c190b0d..29689d6 100644 --- a/test-notebooks/dashboard/app.py +++ b/test-notebooks/dashboard/app.py @@ -1,28 +1,42 @@ from typing import cast +from pathlib import Path +import pandas as pd +import plotly.express as px from dash import ( Dash, - html, - dcc, - callback, - Output, Input, + Output, State, + callback, dash_table, + dcc, + html, ) -import plotly.express as px -import pandas as pd +from lang_main import load_pickle +from lang_main.types import ObjectID, TimelineCandidates from pandas import DataFrame -from lang_main import load_pickle -from lang_main.types import TimelineCandidates, ObjectID - -#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') +# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') # ** data -data = cast(DataFrame, load_pickle('./data.pkl')) -cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl')) -texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl')) +p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl') +p_tl = Path( + r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl' +) +ret = cast(DataFrame, load_pickle(p_df)) +data = ret[0] +ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl)) +cands = ret[0] +texts = ret[1] + +# p_df = Path(r'.\test-notebooks\dashboard\data.pkl') +# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl') +# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl') +# data = cast(DataFrame, load_pickle(p_df)) +# cands = cast(TimelineCandidates, load_pickle(p_cands)) +# texts = cast(dict[ObjectID, str], load_pickle(p_map)) + table_feats = [ 'ErstellungsDatum', 'ErledigungsDatum', @@ -52,25 +66,28 @@ hover_data = { app = Dash(prevent_initial_callbacks=True) app.layout = [ - html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}), - html.Div(children=[ - html.H2('Wählen Sie ein Objekt aus (ObjektID):'), - dcc.Dropdown( - list(cands.keys()), - id='dropdown-selection', - placeholder="ObjektID auswählen...", - ) - ]), - html.Div(children=[ - html.H3(id='object_text'), - dcc.Dropdown(id='choice-candidates'), - dcc.Graph(id='graph-output'), - ]), - html.Div(children=[ - dash_table.DataTable(id='table-candidates') - ]), + html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}), + html.Div( + children=[ + html.H2('Wählen Sie ein Objekt aus (ObjektID):'), + dcc.Dropdown( + list(cands.keys()), + id='dropdown-selection', + placeholder='ObjektID auswählen...', + ), + ] + ), + html.Div( + children=[ + html.H3(id='object_text'), + dcc.Dropdown(id='choice-candidates'), + dcc.Graph(id='graph-output'), + ] + ), + html.Div(children=[dash_table.DataTable(id='table-candidates')]), ] + @callback( Output('object_text', 'children'), Input('dropdown-selection', 'value'), @@ -82,6 +99,7 @@ def update_obj_text(obj_id): headline = f'HObjektText: {obj_text}' return headline + @callback( Output('choice-candidates', 'options'), Input('dropdown-selection', 'value'), @@ -90,9 +108,10 @@ def update_obj_text(obj_id): def update_choice_candidates(obj_id): obj_id = int(obj_id) cands_obj_id = cands[obj_id] - choices = list(range(1, len(cands_obj_id)+1)) + choices = list(range(1, len(cands_obj_id) + 1)) return choices + @callback( 
Output('graph-output', 'figure'), Input('choice-candidates', 'value'), @@ -106,7 +125,7 @@ def update_timeline(index, obj_id): title = f'HObjektText: {obj_text}' # cands cands_obj_id = cands[obj_id] - cands_choice = cands_obj_id[int(index)-1] + cands_choice = cands_obj_id[int(index) - 1] # data df = data.loc[list(cands_choice)].sort_index() # figure @@ -117,22 +136,18 @@ def update_timeline(index, obj_id): title=title, hover_data=hover_data, ) - fig.update_traces( - mode='markers+lines', - marker=markers, - marker_symbol='diamond' - ) + fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond') fig.update_xaxes( - tickformat="%B\n%Y", + tickformat='%B\n%Y', rangeslider_visible=True, ) fig.update_yaxes(type='category') - fig.update_layout(hovermode="x unified") + fig.update_layout(hovermode='x unified') return fig + @callback( - [Output('table-candidates', 'data'), - Output('table-candidates', 'columns')], + [Output('table-candidates', 'data'), Output('table-candidates', 'columns')], Input('choice-candidates', 'value'), State('dropdown-selection', 'value'), prevent_initial_call=True, @@ -141,19 +156,20 @@ def update_table_candidates(index, obj_id): obj_id = int(obj_id) # cands cands_obj_id = cands[obj_id] - cands_choice = cands_obj_id[int(index)-1] + cands_choice = cands_obj_id[int(index) - 1] # data df = data.loc[list(cands_choice)].sort_index() - df = (df - .filter(items=table_feats, axis=1) - .sort_values(by='ErstellungsDatum', ascending=True)) - cols = [{"name": i, "id": i} for i in df.columns] + df = df.filter(items=table_feats, axis=1).sort_values( + by='ErstellungsDatum', ascending=True + ) + cols = [{'name': i, 'id': i} for i in df.columns] # convert dates to strings for col in table_feats_dates: df[col] = df[col].dt.strftime(r'%Y-%m-%d') - + table_data = df.to_dict('records') return table_data, cols + if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/test-notebooks/dashboard/data.pkl b/test-notebooks/dashboard/archive/data.pkl similarity index 100% rename from test-notebooks/dashboard/data.pkl rename to test-notebooks/dashboard/archive/data.pkl diff --git a/test-notebooks/dashboard/map_candidates.pkl b/test-notebooks/dashboard/archive/map_candidates.pkl similarity index 100% rename from test-notebooks/dashboard/map_candidates.pkl rename to test-notebooks/dashboard/archive/map_candidates.pkl diff --git a/test-notebooks/dashboard/map_texts.pkl b/test-notebooks/dashboard/archive/map_texts.pkl similarity index 100% rename from test-notebooks/dashboard/map_texts.pkl rename to test-notebooks/dashboard/archive/map_texts.pkl diff --git a/test-notebooks/dashboard/lang_main_config.toml b/test-notebooks/dashboard/lang_main_config.toml new file mode 100644 index 0000000..c694e25 --- /dev/null +++ b/test-notebooks/dashboard/lang_main_config.toml @@ -0,0 +1,56 @@ +# lang_main: Config file + +[paths] +inputs = './inputs/' +results = './results/test_new2/' +dataset = './01_2_Rohdaten_neu/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = false +graph_postprocessing = false +graph_postprocessing_skip = false +time_analysis = false +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + 
+[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +input_features = [ + 'VorgangsTypName', + 'VorgangsArtText', + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl b/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl differ diff --git a/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl b/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl new file mode 100644 index 0000000..5565194 Binary files /dev/null and b/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl differ diff --git a/test-notebooks/display_results.ipynb b/test-notebooks/display_results.ipynb new file mode 100644 index 0000000..ce71331 --- /dev/null +++ b/test-notebooks/display_results.ipynb @@ -0,0 +1,663 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "id": "3760b040-985c-46ec-ba77-13f0f7a52c83", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from lang_main import load_pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86", + "metadata": {}, + "outputs": [], + "source": [ + "def get_files(path: str) -> tuple[Path, ...]:\n", + " p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + " assert p.exists(), \"path does not exist\"\n", + " return tuple(p.glob(r'*'))" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + "files" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "55ad4af3-87cd-4189-9309-171aba4e04a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n" + ] + } + ], + "source": [ + "file = files[-1]\n", + "ret = load_pickle(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "540f4720-a2bf-4171-8db5-8e6993d38c13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrylennum_occurassoc_obj_idsnum_assoc_obj_ids
162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
33Wöchentliche Sichtkontrolle / Reinigung393108[301, 304, 305, 313, 314, 323, 329, 331, 332, ...74
131Tägliche Überprüfung der Ölabscheider371619[0, 970, 2134, 2137]4
160Wöchentliche Kontrolle der C-Anlagen361265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
..................
2559Fehler 9723 Leistungsversorgung Antrieb defekt461[211]1
2558T-Warp-Let-Off1 schleppfehler301[93]1
2557Fahrräder wurden gewartet und gereinigt.401[1707]1
2556Bohrlöcher an Gebots- und Verbotszeichen anbri...1731[1]1
6782Befestigung Deckel für Batteriefach defekt ...1062[306, 326]2
\n", + "

4545 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " entry ... num_assoc_obj_ids\n", + "162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n", + "33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n", + "131 Tägliche Überprüfung der Ölabscheider ... 4\n", + "160 Wöchentliche Kontrolle der C-Anlagen ... 11\n", + "140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n", + "... ... ... ...\n", + "2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n", + "2558 T-Warp-Let-Off1 schleppfehler ... 1\n", + "2557 Fahrräder wurden gewartet und gereinigt. ... 1\n", + "2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n", + "6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n", + "\n", + "[4545 rows x 5 columns]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ret[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a059ea-0eb8-4db1-b859-3fc07e42faff", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "d1c1190f-0c80-40e3-8965-78d68400a33d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + "files" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n" + ] + } + ], + "source": [ + "file = files[-1]\n", + "ret = load_pickle(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "beacf5ca-6946-413a-817c-e7e87da9ace3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexentrylennum_occurassoc_obj_idsnum_assoc_obj_ids
0162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
133Wöchentliche Sichtkontrolle / Reinigung393108[301, 304, 305, 313, 314, 323, 329, 331, 332, ...74
2131Tägliche Überprüfung der Ölabscheider371619[0, 970, 2134, 2137]4
3160Wöchentliche Kontrolle der C-Anlagen361265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
4140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
.....................
67562559Fehler 9723 Leistungsversorgung Antrieb defekt461[211]1
67572558T-Warp-Let-Off1 schleppfehler301[93]1
67582557Fahrräder wurden gewartet und gereinigt.401[1707]1
67592556Bohrlöcher an Gebots- und Verbotszeichen anbri...1731[1]1
67606782Befestigung Deckel für Batteriefach defekt ...1062[306, 326]2
\n", + "

4545 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " index ... num_assoc_obj_ids\n", + "0 162 ... 206\n", + "1 33 ... 74\n", + "2 131 ... 4\n", + "3 160 ... 11\n", + "4 140 ... 166\n", + "... ... ... ...\n", + "6756 2559 ... 1\n", + "6757 2558 ... 1\n", + "6758 2557 ... 1\n", + "6759 2556 ... 1\n", + "6760 6782 ... 2\n", + "\n", + "[4545 rows x 6 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ret[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "cbf0b450-ec00-471f-9627-717e52c5471d", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807", + "metadata": {}, + "outputs": [], + "source": [ + "tup = tuple(i for i in range(100000000))" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ade9c6f4e61410fb93f35e43222705b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100000000 [00:00= 0.97)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30220" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "5b78436e-a828-42bd-a5ed-ae6045349391", + "metadata": {}, + "outputs": [], + "source": [ + "batch = idx[:200]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "75edc50e-b64c-4319-8f74-27653ed3452c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tuple(map(tuple, batch))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tuple(tuple(x) for x in batch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}