STRF for similarity duplicates, time analysis pipeline, enhanced config
This commit is contained in:
parent 5d2c97165a
commit bb987e2108
@@ -34,3 +34,15 @@ trials = [
    "plotly>=5.22.0",
    "dash>=2.17.0",
]

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]
@@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast

from pandas import DataFrame, Series

from ihm_analyse import (
    SAVE_PATH_FOLDER,
    PATH_TO_DATASET,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
    DO_PREPROCESSING,
    DO_TOKEN_ANALYSIS,
    DO_GRAPH_POSTPROCESSING,
from lang_main import (
    TokenGraph,
    create_saving_folder,
    load_pickle,
    Embedding,
    Index,
    TokenGraph,
)
from ihm_analyse.predefined_pipes import (
    pipe_target_feat,
    pipe_embds,
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series


# ** processing pipeline
def run_preprocessing() -> DataFrame:
@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
|
||||
overwrite_existing=True,
|
||||
)
|
||||
# run pipelines
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
|
||||
)
|
||||
target_feat_data = ret[0]
|
||||
# only entries with more than threshold amount of characters
|
||||
data_filter = typing.cast(Series,
|
||||
(target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
dupl_idx_pairs, embds = typing.cast(
|
||||
tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
|
||||
pipe_embds.run(starting_values=(subset_data,))
|
||||
)
|
||||
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
# dupl_idx_pairs, embds = typing.cast(
|
||||
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
|
||||
# pipe_embds.run(starting_values=(subset_data,)),
|
||||
# )
|
||||
# merge duplicates, results saved separately
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
|
||||
subset_data = target_feat_data.loc[data_filter].copy()
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame],
|
||||
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
|
||||
pipe_merge.run(starting_values=(subset_data,)),
|
||||
)
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
return preprocessed_data
|
||||
|
||||
|
||||
def run_token_analysis(
|
||||
preprocessed_data: DataFrame,
|
||||
) -> TokenGraph:
|
||||
# build token graph
|
||||
(tk_graph,) = typing.cast(tuple[TokenGraph],
|
||||
pipe_token_analysis.run(starting_values=(preprocessed_data,)))
|
||||
(tk_graph,) = typing.cast(
|
||||
tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
|
||||
)
|
||||
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
|
||||
tk_graph.to_pickle(SAVE_PATH_FOLDER,
|
||||
filename=f'{pipe_token_analysis.name}-TokenGraph')
|
||||
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
|
||||
|
||||
return tk_graph
|
||||
|
||||
|
||||
def run_graph_postprocessing(
|
||||
tk_graph: TokenGraph,
|
||||
) -> TokenGraph:
|
||||
# filter graph by edge weight and remove single nodes (no connection)
|
||||
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
|
||||
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
|
||||
tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
|
||||
filename='TokenGraph-filtered',
|
||||
directed=False)
|
||||
tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
|
||||
filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')
|
||||
tk_graph_filtered.save_graph(
|
||||
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
|
||||
)
|
||||
tk_graph_filtered.to_pickle(
|
||||
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
|
||||
)
|
||||
|
||||
return tk_graph_filtered
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
|
||||
filename = 'without_nan'
|
||||
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
ret = load_pickle(loading_path)
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
ret = cast(
|
||||
tuple[TimelineCandidates, dict[ObjectID, str]],
|
||||
pipe_timeline.run(starting_values=(preprocessed_data,)),
|
||||
)
|
||||
return ret
|
||||
|
||||
|
||||
def verify_path(
|
||||
loading_path: Path,
|
||||
) -> None:
|
||||
if not loading_path.exists():
|
||||
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
|
||||
|
||||
|
||||
def main() -> None:
|
||||
pre_step_skipped: bool = False
|
||||
# ** preprocess
|
||||
if DO_PREPROCESSING:
|
||||
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
|
||||
preprocessed_data = run_preprocessing()
|
||||
else:
|
||||
elif not SKIP_PREPROCESSING:
|
||||
# !! hardcoded result filenames
|
||||
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
|
||||
target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
load_pickle(target_filepath))
|
||||
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
|
||||
verify_path(loading_path)
|
||||
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
|
||||
preprocessed_data = ret[0]
|
||||
# ** token analysis
|
||||
if DO_TOKEN_ANALYSIS:
|
||||
preprocessed_data_trunc = typing.cast(DataFrame,
|
||||
preprocessed_data[['entry', 'num_occur']].copy()) # type: ignore
|
||||
tk_graph = run_token_analysis(preprocessed_data_trunc)
|
||||
else:
|
||||
pre_step_skipped = True
|
||||
warnings.warn('No preprocessing action selected. Skipped.')
|
||||
# sys.exit(0)
|
||||
# ** token analysis
|
||||
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
|
||||
if pre_step_skipped:
|
||||
raise RuntimeError(
|
||||
'Preprocessing step skipped. Token analysis cannot be performed.'
|
||||
)
|
||||
preprocessed_data_trunc = typing.cast(
|
||||
DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
|
||||
) # type: ignore
|
||||
tk_graph = run_token_analysis(preprocessed_data_trunc)
|
||||
elif not SKIP_TOKEN_ANALYSIS:
|
||||
# !! hardcoded result filenames
|
||||
# whole graph
|
||||
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
|
||||
tk_graph = TokenGraph.from_pickle(loading_path)
|
||||
# ** graph postprocessing
|
||||
if DO_GRAPH_POSTPROCESSING:
|
||||
tk_graph_filtered = run_graph_postprocessing(tk_graph)
|
||||
pre_step_skipped = False
|
||||
else:
|
||||
pre_step_skipped = True
|
||||
warnings.warn('No token analysis action selected. Skipped.')
|
||||
# ** graph postprocessing
|
||||
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
|
||||
if pre_step_skipped:
|
||||
raise RuntimeError(
|
||||
(
|
||||
'Preprocessing or token analysis step skipped. '
|
||||
'Graph postprocessing cannot be performed.'
|
||||
)
|
||||
)
|
||||
tk_graph_filtered = run_graph_postprocessing(tk_graph)
|
||||
elif not SKIP_GRAPH_POSTPROCESSING:
|
||||
# !! hardcoded result filenames
|
||||
# filtered graph
|
||||
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
|
||||
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
|
||||
pre_step_skipped = False
|
||||
else:
|
||||
warnings.warn('No graph postprocessing action selected. Skipped.')
|
||||
# ** time analysis
|
||||
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
|
||||
# no check for fails, runs separately
|
||||
ret = run_time_analysis()
|
||||
elif not SKIP_TIME_ANALYSIS:
|
||||
...
|
||||
else:
|
||||
warnings.warn('No time analysis action selected. Skipped.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
BIN  scripts/inputs/without_nan.pkl  (new file)
Binary file not shown.
@@ -1,17 +1,21 @@
# lang_main: Config file

[paths]
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = false
token_analysis = true
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
scripts/lang_main_config.toml  (new file, 59 lines)
@@ -0,0 +1,59 @@
# lang_main: Config file

[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
# input_features = [
#     'VorgangsTypName',
#     'VorgangsArtText',
#     'VorgangsBeschreibung',
# ]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
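For reference, the constants module further down in this diff consumes these values through load_toml_config. A minimal, self-contained sketch of how such a file can be mapped onto typed constants, assuming only the standard library's tomllib (Python 3.11+) and a config file lying next to the script; the constant names mirror the diff, the loading code itself is illustrative and not the project's helper:

# Sketch only: stand-alone TOML loading, not the project's load_toml_config.
import tomllib
from pathlib import Path
from typing import Any, Final

CONFIG_FILE = Path('lang_main_config.toml')  # assumed location next to the script
with CONFIG_FILE.open('rb') as f:            # tomllib requires a binary file handle
    cfg: dict[str, Any] = tomllib.load(f)

SAVE_PATH_FOLDER: Final[Path] = Path(cfg['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(cfg['paths']['dataset'])
DO_TIME_ANALYSIS: Final[bool] = cfg['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = cfg['control']['time_analysis_skip']
THRESHOLD_SIMILARITY: Final[float] = cfg['preprocess']['threshold_similarity']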
scripts/test.py  (new file, 12 lines)
@@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER

print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789

Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))
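This script feeds a deliberately noisy German sample through clean_string_slim. Judging from the regex substitutions shown later in this diff (runs of [\t\n\r\f\v] collapsed to a space, repeated punctuation reduced to one character, then strip), a rough standalone approximation of that cleaning step, not the package's exact implementation, would be:

import re

def clean_string_slim_approx(string: str) -> str:
    # collapse tabs, newlines and other vertical whitespace into single spaces
    string = re.sub(r'[\t\n\r\f\v]+', ' ', string)
    # reduce runs of repeated punctuation (e.g. '::::', '!!!!', '+++') to one character;
    # this character class is a cleaned-up guess, the diff's own pattern differs slightly
    string = re.sub(r'([,;.:!?\-_+]){2,}', r'\1', string)
    return string.strip()

print(clean_string_slim_approx('tel:::: !!!!???? +++49 123 456 789'))
# -> 'tel: ? +49 123 456 789'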
@@ -1,18 +1,19 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.shared import (
    save_pickle,
    load_pickle,
    create_saving_folder,
    load_toml_config,
)
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
    create_saving_folder,
    load_pickle,
    load_toml_config,
    save_pickle,
)

__all__ = [
    'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
    datefmt=LOG_DATE_FMT,
)

USE_INTERNAL_CONFIG: Final[bool] = True
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False

pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    curr_file_dir = Path(inspect.getfile(inspect.currentframe()))  # type: ignore
    pkg_dir = curr_file_dir.parent
    config_path = Path(pkg_dir, 'config.toml')
    loaded_config = load_toml_config(path_to_toml=config_path)
    CONFIG: Final[dict[str, Any]] = loaded_config.copy()
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    raise NotImplementedError("External config data not implemented yet.")
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    cfg_path_external = caller_file.parent / CONFIG_FILENAME
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the program.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

# ** paths
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
    CONFIG['preprocess']['filename_cossim_filter_candidates']
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\
    CONFIG['preprocess']['threshold_amount_characters']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts']
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

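The reworked main() earlier in this diff pairs every DO_* flag with a SKIP_* counterpart (fed by the *_skip keys under [control] above): a step is either computed, restored from a previously pickled result, or skipped with a warning. A minimal sketch of that three-way pattern, with purely illustrative stand-in functions rather than the project's own:

import warnings

def run_step() -> str:              # stand-in for e.g. run_preprocessing()
    return 'fresh result'

def load_cached_result() -> str:    # stand-in for load_pickle() on a saved result file
    return 'cached result'

def resolve_step(do_step: bool, skip_step: bool) -> str | None:
    if do_step and not skip_step:
        return run_step()            # compute the step from scratch
    if not skip_step:
        return load_cached_result()  # reuse the result of an earlier run
    warnings.warn('Step skipped.')   # later steps must tolerate the missing result
    return None

print(resolve_step(do_step=True, skip_step=False))   # -> fresh result
print(resolve_step(do_step=False, skip_step=False))  # -> cached result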
@ -1,18 +1,18 @@
|
||||
import typing
|
||||
from typing import Any, Self, Literal, overload, Final
|
||||
import sys
|
||||
from collections.abc import Hashable
|
||||
from pathlib import Path
|
||||
import copy
|
||||
import sys
|
||||
import typing
|
||||
from collections.abc import Hashable, Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Final, Literal, Self, overload
|
||||
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from networkx import Graph, DiGraph
|
||||
import networkx as nx
|
||||
from networkx import DiGraph, Graph
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# TODO change logging behaviour, add logging to file
|
||||
LOGGING_DEFAULT: Final[bool] = False
|
||||
@ -31,8 +31,7 @@ def get_graph_metadata(
|
||||
min_edge_weight: int = 1_000_000
|
||||
max_edge_weight: int = 0
|
||||
for edge in graph.edges:
|
||||
weight = typing.cast(int,
|
||||
graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < min_edge_weight:
|
||||
min_edge_weight = weight
|
||||
if weight > max_edge_weight:
|
||||
@ -54,18 +53,20 @@ def get_graph_metadata(
|
||||
)
|
||||
|
||||
if logging:
|
||||
logger.info((f"Graph properties: {num_nodes} Nodes, "
|
||||
f"{num_edges} Edges"))
|
||||
logger.info(f"Node memory: {node_mem / 1024:.2f} KB")
|
||||
logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB")
|
||||
logger.info(f"Total memory: {total_mem / 1024:.2f} KB")
|
||||
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
|
||||
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
|
||||
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
|
||||
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
|
||||
|
||||
return graph_info
|
||||
|
||||
|
||||
def update_graph(
|
||||
graph: Graph | DiGraph,
|
||||
parent: Hashable,
|
||||
child: Hashable,
|
||||
*,
|
||||
batch: Iterable[tuple[Hashable, Hashable]] | None = None,
|
||||
parent: Hashable | None = None,
|
||||
child: Hashable | None = None,
|
||||
weight_connection: int = 1,
|
||||
) -> None:
|
||||
# !! not necessary to check for existence of nodes
|
||||
@ -78,7 +79,9 @@ def update_graph(
|
||||
graph.add_node(child)
|
||||
"""
|
||||
# check if edge not in Graph
|
||||
if not graph.has_edge(parent, child):
|
||||
if batch is not None:
|
||||
graph.add_edges_from(batch, weight=weight_connection)
|
||||
elif not graph.has_edge(parent, child):
|
||||
# create new edge, nodes will be created if not already present
|
||||
graph.add_edge(parent, child, weight=weight_connection)
|
||||
else:
|
||||
@ -87,16 +90,15 @@ def update_graph(
|
||||
weight += weight_connection
|
||||
graph[parent][child]['weight'] = weight
|
||||
|
||||
|
||||
# build undirected adjacency matrix
|
||||
def convert_graph_to_undirected(
|
||||
graph: DiGraph,
|
||||
logging: bool = LOGGING_DEFAULT,
|
||||
) -> Graph:
|
||||
# get adjacency matrix
|
||||
adj_mat = typing.cast(DataFrame,
|
||||
nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32],
|
||||
adj_mat.to_numpy())
|
||||
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
|
||||
# build undirected array: adding edges of lower triangular matrix to upper one
|
||||
arr_upper = np.triu(arr)
|
||||
arr_lower = np.tril(arr)
|
||||
@ -104,18 +106,17 @@ def convert_graph_to_undirected(
|
||||
arr_new = arr_upper + arr_lower
|
||||
# assign new data and create graph
|
||||
adj_mat.loc[:] = arr_new # type: ignore
|
||||
graph_undir = typing.cast(Graph,
|
||||
nx.from_pandas_adjacency(df=adj_mat))
|
||||
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
|
||||
|
||||
# info about graph
|
||||
if logging:
|
||||
logger.info("Successfully converted graph to one with undirected edges.")
|
||||
logger.info('Successfully converted graph to one with undirected edges.')
|
||||
_ = get_graph_metadata(graph=graph_undir, logging=logging)
|
||||
|
||||
return graph_undir
|
||||
|
||||
class TokenGraph(DiGraph):
|
||||
|
||||
class TokenGraph(DiGraph):
|
||||
def __init__(
|
||||
self,
|
||||
name: str = 'TokenGraph',
|
||||
@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
|
||||
return self.__str__()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return (f"TokenGraph(name: {self.name}, number of nodes: "
|
||||
f"{len(self.nodes)}, number of edges: "
|
||||
f"{len(self.edges)})")
|
||||
return (
|
||||
f'TokenGraph(name: {self.name}, number of nodes: '
|
||||
f'{len(self.nodes)}, number of edges: '
|
||||
f'{len(self.edges)})'
|
||||
)
|
||||
|
||||
# !! only used to verify that saving was done correctly
|
||||
"""
|
||||
@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
|
||||
self,
|
||||
inplace: Literal[True] = ...,
|
||||
logging: bool | None = ...,
|
||||
) -> None:
|
||||
...
|
||||
) -> None: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: Literal[False],
|
||||
logging: bool | None = ...,
|
||||
) -> Graph:
|
||||
...
|
||||
) -> Graph: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: bool = ...,
|
||||
logging: bool | None = ...
|
||||
) -> Graph | None:
|
||||
...
|
||||
self, inplace: bool = ..., logging: bool | None = ...
|
||||
) -> Graph | None: ...
|
||||
|
||||
def to_undirected(
|
||||
self,
|
||||
@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._undirected = convert_graph_to_undirected(graph=self,
|
||||
logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
if not inplace:
|
||||
return self._undirected
|
||||
|
||||
@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._metadata_directed = get_graph_metadata(graph=self,
|
||||
logging=logging)
|
||||
self._metadata_directed = get_graph_metadata(graph=self, logging=logging)
|
||||
if self._undirected is not None:
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
|
||||
def filter_by_edge_weight(
|
||||
self,
|
||||
@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
|
||||
filtered_graph = self.copy()
|
||||
|
||||
for edge in original_graph_edges:
|
||||
weight = typing.cast(int,
|
||||
filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < threshold:
|
||||
filtered_graph.remove_edge(edge[0], edge[1])
|
||||
|
||||
@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
|
||||
filename: str | None = None,
|
||||
) -> Path:
|
||||
if filename is not None:
|
||||
saving_path = path.joinpath(f"{filename}")
|
||||
saving_path = path.joinpath(f'{filename}')
|
||||
else:
|
||||
saving_path = path.joinpath(f"{self.name}")
|
||||
saving_path = path.joinpath(f'{self.name}')
|
||||
|
||||
return saving_path
|
||||
|
||||
@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
|
||||
elif not directed and self._undirected is not None:
|
||||
target_graph = self._undirected
|
||||
else:
|
||||
raise ValueError("No undirected graph available.")
|
||||
raise ValueError('No undirected graph available.')
|
||||
|
||||
saving_path = saving_path.with_suffix('.graphml')
|
||||
nx.write_graphml(G=target_graph, path=saving_path)
|
||||
logger.info(("Successfully saved graph as GraphML file "
|
||||
f"under {saving_path}."))
|
||||
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
|
||||
|
||||
def to_pickle(
|
||||
self,
|
||||
@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
|
||||
match path.suffix:
|
||||
case '.graphml':
|
||||
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
|
||||
logger.info(f"Successfully loaded graph from GraphML file {path}.")
|
||||
logger.info(f'Successfully loaded graph from GraphML file {path}.')
|
||||
case '.pkl' | '.pickle':
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
logger.info(f"Successfully loaded graph from pickle file {path}.")
|
||||
logger.info(f'Successfully loaded graph from pickle file {path}.')
|
||||
case _:
|
||||
raise ValueError("File format not supported.")
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
return graph
|
||||
|
||||
@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
|
||||
path = Path(path)
|
||||
|
||||
if path.suffix not in ('.pkl', '.pickle'):
|
||||
raise ValueError("File format not supported.")
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
|
||||
|
||||
@ -1,29 +1,29 @@
|
||||
from typing import cast, Callable
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from itertools import combinations
|
||||
import re
|
||||
from math import factorial
|
||||
from pathlib import Path
|
||||
from typing import Callable, cast
|
||||
|
||||
import numpy as np
|
||||
from torch import Tensor
|
||||
from pandas import DataFrame, Series
|
||||
import pandas as pd
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from pandas import DataFrame, Series
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from torch import Tensor
|
||||
from tqdm import tqdm
|
||||
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.shared import (
|
||||
candidates_by_index,
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
|
||||
|
||||
# ** (1) dataset preparation: loading and simple preprocessing
|
||||
@ -67,11 +67,16 @@ def load_raw_data(
|
||||
parse_dates=date_cols,
|
||||
dayfirst=True,
|
||||
)
|
||||
logger.info("Loaded dataset successfully.")
|
||||
logger.info((f"Dataset properties: number of entries: {len(data)}, "
|
||||
f"number of features {len(data.columns)}"))
|
||||
logger.info('Loaded dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'Dataset properties: number of entries: {len(data)}, '
|
||||
f'number of features {len(data.columns)}'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
def remove_duplicates(
|
||||
data: DataFrame,
|
||||
) -> tuple[DataFrame]:
|
||||
@ -89,7 +94,7 @@ def remove_duplicates(
|
||||
"""
|
||||
# obtain info about duplicates over all features
|
||||
duplicates_filt = data.duplicated()
|
||||
logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}")
|
||||
logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
|
||||
# drop duplicates
|
||||
wo_duplicates = data.drop_duplicates(ignore_index=True)
|
||||
duplicates_subset: list[str] = [
|
||||
@ -97,16 +102,26 @@ def remove_duplicates(
|
||||
'ObjektID',
|
||||
]
|
||||
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
|
||||
logger.info(("Number of duplicates over subset "
|
||||
f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}"))
|
||||
wo_duplicates =\
|
||||
wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy()
|
||||
logger.info("Removed all duplicates from dataset successfully.")
|
||||
logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, "
|
||||
f"number of features {len(wo_duplicates.columns)}"))
|
||||
logger.info(
|
||||
(
|
||||
'Number of duplicates over subset '
|
||||
f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
|
||||
)
|
||||
)
|
||||
wo_duplicates = wo_duplicates.drop_duplicates(
|
||||
subset=duplicates_subset, ignore_index=True
|
||||
).copy()
|
||||
logger.info('Removed all duplicates from dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
|
||||
f'number of features {len(wo_duplicates.columns)}'
|
||||
)
|
||||
)
|
||||
|
||||
return (wo_duplicates,)
|
||||
|
||||
|
||||
def remove_NA(
|
||||
data: DataFrame,
|
||||
target_features: list[str] = [
|
||||
@ -128,15 +143,16 @@ def remove_NA(
|
||||
dataset with removed NA entries for given subset of features
|
||||
"""
|
||||
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
|
||||
logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.")
|
||||
logger.info(
|
||||
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
|
||||
)
|
||||
|
||||
return (wo_NA,)
|
||||
|
||||
|
||||
# ** (2) entry-based cleansing
|
||||
# following functions clean and prepare specific entries, not whole dataset
|
||||
def clean_string_slim(
|
||||
string: str
|
||||
) -> str:
|
||||
def clean_string_slim(string: str) -> str:
|
||||
"""mapping function to clean single string entries in a series (feature-wise)
|
||||
of the dataset, used to be applied element-wise for string features
|
||||
|
||||
@ -151,13 +167,16 @@ def clean_string_slim(
|
||||
cleaned entry
|
||||
"""
|
||||
# remove special chars
|
||||
pattern = r'[\t\n\r\f\v]'
|
||||
pattern = r'[\t\n\r\f\v]+'
|
||||
string = re.sub(pattern, ' ', string)
|
||||
pattern = r'([,;.:!?-_\+]){2,}'
|
||||
# remove whitespaces at the beginning and the end
|
||||
string = re.sub(pattern, r'\1', string)
|
||||
string = string.strip()
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def entry_wise_cleansing(
|
||||
data: DataFrame,
|
||||
target_feature: str,
|
||||
@ -165,10 +184,16 @@ def entry_wise_cleansing(
|
||||
) -> tuple[DataFrame]:
|
||||
# apply given cleansing function to target feature
|
||||
data[target_feature] = data[target_feature].map(cleansing_func)
|
||||
logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< "
|
||||
f"for feature >>{target_feature}<<"))
|
||||
logger.info(
|
||||
(
|
||||
f'Successfully applied entry-wise cleansing procedure '
|
||||
f'>>{cleansing_func.__name__}<< '
|
||||
f'for feature >>{target_feature}<<'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
# ** in-depth analysis of one feature
|
||||
# following functions try to gain insights on a given feature of the IHM dataset such
|
||||
# as number of occurrences or associated Object IDs
|
||||
@ -178,7 +203,7 @@ def analyse_feature(
|
||||
) -> tuple[DataFrame]:
|
||||
# feature columns
|
||||
feature_entries = data[target_feature]
|
||||
logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}")
|
||||
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
|
||||
# obtain unique entries
|
||||
unique_feature_entries = feature_entries.unique()
|
||||
|
||||
@ -186,7 +211,7 @@ def analyse_feature(
|
||||
cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
|
||||
result_df = pd.DataFrame(columns=cols)
|
||||
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.):
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.0):
|
||||
len_entry = len(entry)
|
||||
filt = data[target_feature] == entry
|
||||
temp = data[filt]
|
||||
@ -195,13 +220,10 @@ def analyse_feature(
|
||||
num_assoc_obj_ids = len(assoc_obj_ids)
|
||||
num_dupl = filt.sum()
|
||||
|
||||
conc_df = pd.DataFrame(data=[[
|
||||
entry,
|
||||
len_entry,
|
||||
num_dupl,
|
||||
assoc_obj_ids,
|
||||
num_assoc_obj_ids
|
||||
]], columns=cols)
|
||||
conc_df = pd.DataFrame(
|
||||
data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
|
||||
columns=cols,
|
||||
)
|
||||
|
||||
result_df = pd.concat([result_df, conc_df], ignore_index=True)
|
||||
|
||||
@ -230,9 +252,9 @@ def build_embedding_map(
|
||||
is_STRF = True
|
||||
|
||||
if not any((is_spacy, is_STRF)):
|
||||
raise NotImplementedError("Model type unknown")
|
||||
raise NotImplementedError('Model type unknown')
|
||||
|
||||
for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.):
|
||||
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
|
||||
# verbose code: Pyright not inferring types correctly
|
||||
idx = cast(int, idx)
|
||||
text = cast(str, text)
|
||||
@ -246,12 +268,17 @@ def build_embedding_map(
|
||||
logger.debug(f'{embd.text=} has no vector')
|
||||
elif is_STRF:
|
||||
model = cast(SentenceTransformer, model)
|
||||
embd = cast(Tensor,
|
||||
model.encode(text, show_progress_bar=False))
|
||||
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
|
||||
embeddings[idx] = (embd, text)
|
||||
|
||||
return embeddings, (is_spacy, is_STRF)
|
||||
|
||||
|
||||
# adapt interface
|
||||
# use candidates by index function
|
||||
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
|
||||
|
||||
|
||||
# build similarity matrix out of embeddings
|
||||
def build_cosSim_matrix(
|
||||
data: Series,
|
||||
@ -259,10 +286,11 @@ def build_cosSim_matrix(
|
||||
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
|
||||
# build empty matrix
|
||||
df_index = data.index
|
||||
cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index,
|
||||
index=df_index, dtype=np.float32)
|
||||
cosineSim_idx_matrix = pd.DataFrame(
|
||||
data=0.0, columns=df_index, index=df_index, dtype=np.float32
|
||||
)
|
||||
|
||||
logger.info("Start building embedding map...")
|
||||
logger.info('Start building embedding map...')
|
||||
|
||||
# obtain embeddings based on used model
|
||||
embds, (is_spacy, is_STRF) = build_embedding_map(
|
||||
@ -270,15 +298,15 @@ def build_cosSim_matrix(
|
||||
model=model,
|
||||
)
|
||||
|
||||
logger.info("Embedding map built successfully.")
|
||||
logger.info('Embedding map built successfully.')
|
||||
|
||||
# apply index based mapping for efficient handling of large texts
|
||||
combs = combinations(df_index, 2)
|
||||
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
|
||||
|
||||
logger.info("Start calculation of similarity scores...")
|
||||
logger.info('Start calculation of similarity scores...')
|
||||
|
||||
for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.):
|
||||
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
|
||||
# print(f"{idx1=}, {idx2=}")
|
||||
embd1 = embds[idx1][0]
|
||||
embd2 = embds[idx2][0]
|
||||
@ -296,10 +324,11 @@ def build_cosSim_matrix(
|
||||
|
||||
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
|
||||
|
||||
logger.info("Similarity scores calculated successfully.")
|
||||
logger.info('Similarity scores calculated successfully.')
|
||||
|
||||
return cosineSim_idx_matrix, embds
|
||||
|
||||
|
||||
# obtain index pairs with cosine similarity
|
||||
# greater than or equal to given threshold value
|
||||
def filt_thresh_cosSim_matrix(
|
||||
@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
|
||||
Series
|
||||
series with multi index (index pairs) and corresponding similarity score
|
||||
"""
|
||||
cosineSim_filt = cast(Series,
|
||||
cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack())
|
||||
cosineSim_filt = cast(
|
||||
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
|
||||
)
|
||||
|
||||
return cosineSim_filt, embds
|
||||
|
||||
|
||||
def list_cosSim_dupl_candidates(
|
||||
cosineSim_filt: Series,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
|
||||
list containing relevant index pairs for entries with similarity score greater than
|
||||
given threshold
|
||||
"""
|
||||
logger.info("Start gathering of similarity candidates...")
|
||||
logger.info('Start gathering of similarity candidates...')
|
||||
# compare found duplicates
|
||||
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
embds[idx1][1],
|
||||
idx2,
|
||||
embds[idx2][1],
|
||||
score,
|
||||
]]
|
||||
]
|
||||
]
|
||||
# add candidates to collection DataFrame
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
if df_candidates.empty:
|
||||
@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
|
||||
# save index pairs
|
||||
index_pairs.append((idx1, idx2))
|
||||
|
||||
logger.info("Similarity candidates gathered successfully.")
|
||||
logger.info('Similarity candidates gathered successfully.')
|
||||
|
||||
if save_candidates:
|
||||
if saving_path is None:
|
||||
raise ValueError(("Saving path must be provided if duplicate "
|
||||
"candidates should be saved."))
|
||||
raise ValueError(
|
||||
('Saving path must be provided if duplicate ' 'candidates should be saved.')
|
||||
)
|
||||
elif pipeline is not None:
|
||||
target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_'
|
||||
+ filename + '.xlsx')
|
||||
target_filename = (
|
||||
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
|
||||
)
|
||||
elif pipeline is None:
|
||||
target_filename = f'{filename}.xlsx'
|
||||
logger.info("Saving similarity candidates...")
|
||||
logger.info('Saving similarity candidates...')
|
||||
target_path = saving_path.joinpath(target_filename)
|
||||
df_candidates.to_excel(target_path)
|
||||
logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.")
|
||||
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
|
||||
|
||||
return index_pairs, embds
|
||||
|
||||
|
||||
# TODO: change implementation fully to SentenceTransformer
|
||||
# usage of batch processing for embeddings, use candidate idx function
|
||||
# from time analysis --> moved to ``helpers.py``
|
||||
@ -419,16 +455,24 @@ def similar_ids_groups(
|
||||
yield list(id_group)
|
||||
"""
|
||||
|
||||
|
||||
def merge_similarity_dupl(
|
||||
data: DataFrame,
|
||||
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info("Start merging of similarity candidates...")
|
||||
logger.info('Start merging of similarity candidates...')
|
||||
|
||||
# data
|
||||
merged_data = data.copy()
|
||||
model_input = merged_data['entry']
|
||||
candidates_idx = candidates_by_index(
|
||||
data_model_input=model_input,
|
||||
model=model,
|
||||
cos_sim_threshold=cos_sim_threshold,
|
||||
)
|
||||
# graph of similar ids
|
||||
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
|
||||
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
|
||||
|
||||
for similar_id_group in similar_index_groups(similar_id_graph):
|
||||
similar_id_group = list(similar_id_group)
|
||||
@ -454,10 +498,11 @@ def merge_similarity_dupl(
|
||||
merged_data.update(merged_similar_data)
|
||||
merged_data = merged_data.drop(index=similar_id_group)
|
||||
|
||||
logger.info("Similarity candidates merged successfully.")
|
||||
logger.info('Similarity candidates merged successfully.')
|
||||
|
||||
return (merged_data.copy(),)
|
||||
|
||||
|
||||
# merge duplicates
|
||||
def merge_similarity_dupl_old(
|
||||
data: DataFrame,
|
||||
@ -469,8 +514,7 @@ def merge_similarity_dupl_old(
|
||||
# logger.info("Start merging of similarity candidates...")
|
||||
|
||||
# iterate over index pairs
|
||||
for (i1, i2) in tqdm(dupl_idx_pairs):
|
||||
|
||||
for i1, i2 in tqdm(dupl_idx_pairs):
|
||||
# if an entry does not exist any more, skip this pair
|
||||
if i1 not in index or i2 not in index:
|
||||
continue
|
||||
@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
|
||||
given threshold
|
||||
"""
|
||||
|
||||
|
||||
# compare found duplicates
|
||||
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore
|
||||
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
|
||||
# get texts for comparison
|
||||
text1 = embds[idx1][1]
|
||||
text2 = embds[idx2][1]
|
||||
@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
|
||||
continue
|
||||
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
text1,
|
||||
idx2,
|
||||
text2,
|
||||
score,
|
||||
]]
|
||||
]
|
||||
]
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
|
||||
df_candidates = pd.concat([df_candidates, df_conc])
|
||||
|
||||
@@ -1,11 +1,71 @@
from typing import cast
from collections.abc import Iterable, Iterator
from typing import cast

import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex
from lang_main.analysis.graphs import update_graph, get_graph_metadata


def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    obj_id : ObjectID
        _description_
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
        ObjectID and tuple of index pairs which meet the cosine
        similarity threshold
    """
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair


def similar_index_connection_graph(
@@ -15,21 +75,21 @@ def similar_index_connection_graph(
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    for (idx1, idx2) in similar_idx_pairs:
        # inplace operation, parent/child do not really exist in undirected graph
        update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


# TODO check returning tuple

def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]],
                      nx.connected_components(G=similar_id_graph))
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
|
||||
for id_group in ids_groups:
|
||||
yield tuple(id_group)
|
||||
@ -1,21 +1,17 @@
|
||||
from typing import cast
|
||||
from collections.abc import Iterable, Iterator
|
||||
from typing import cast
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from pandas import DataFrame, Series
|
||||
from torch import Tensor
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from tqdm.auto import tqdm # TODO: check deletion
|
||||
|
||||
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.analysis.shared import (
|
||||
candidates_by_index,
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
|
||||
|
||||
|
||||
def non_relevant_obj_ids(
|
||||
@ -25,16 +21,16 @@ def non_relevant_obj_ids(
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> tuple[ObjectID, ...]:
|
||||
|
||||
data = data.copy()
|
||||
ids_to_ignore: set[ObjectID] = set()
|
||||
obj_ids = cast(Iterable[ObjectID], # actually NumPy array
|
||||
data[feature_obj_id].unique())
|
||||
obj_ids = cast(
|
||||
Iterable[ObjectID], # actually NumPy array
|
||||
data[feature_obj_id].unique(),
|
||||
)
|
||||
|
||||
for obj_id in obj_ids:
|
||||
feats_per_obj_id = cast(
|
||||
Series,
|
||||
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
|
||||
Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
|
||||
)
|
||||
# check for uniqueness of given feature for current ObjectID
|
||||
# ignore NaN values
|
||||
@ -46,14 +42,15 @@ def non_relevant_obj_ids(
|
||||
|
||||
return tuple(ids_to_ignore)
|
||||
|
||||
|
||||
def remove_non_relevant_obj_ids(
|
||||
data: DataFrame,
|
||||
thresh_unique_feat_per_id: int,
|
||||
*,
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> DataFrame:
|
||||
logger.info("Removing non-relevant ObjectIDs from dataset")
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info('Removing non-relevant ObjectIDs from dataset')
|
||||
data = data.copy()
|
||||
ids_to_ignore = non_relevant_obj_ids(
|
||||
data=data,
|
||||
@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
|
||||
)
|
||||
# only retain entries with ObjectIDs not in IDs to ignore
|
||||
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
|
||||
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
|
||||
logger.info("Non-relevant ObjectIDs removed successfully")
|
||||
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
|
||||
logger.info('Non-relevant ObjectIDs removed successfully')
|
||||
|
||||
return data
|
||||
return (data,)
|
||||
|
||||
def filter_activities_per_obj_id(
|
||||
data: DataFrame,
|
||||
activity_feature: str = 'VorgangsTypName',
|
||||
relevant_activity_types: Iterable[str] = (
|
||||
'Reparaturauftrag (Portal)',
|
||||
),
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
threshold_num_activities: int = 1,
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities count occurrences for each ObjectID
|
||||
logger.info("Filtering activities per ObjectID")
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series,
|
||||
data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
|
||||
# index of series contains ObjectIDs
|
||||
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
|
||||
filt_entries_below_thresh = (data_filter_activities[feature_obj_id]
|
||||
.isin(obj_ids_below_thresh))
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info("Activities per ObjectID filtered successfully")
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
def generate_model_input(
|
||||
data: DataFrame,
|
||||
@ -107,8 +74,8 @@ def generate_model_input(
|
||||
'VorgangsArtText',
|
||||
'VorgangsBeschreibung',
|
||||
),
|
||||
) -> DataFrame:
|
||||
logger.info("Generating concatenation of model input features")
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info('Generating concatenation of model input features')
|
||||
data = data.copy()
|
||||
model_input_features = list(model_input_features)
|
||||
input_features = data[model_input_features].fillna('').astype(str)
|
||||
@ -116,9 +83,40 @@ def generate_model_input(
|
||||
lambda x: ' - '.join(x),
|
||||
axis=1,
|
||||
)
|
||||
logger.info("Model input generated successfully")
|
||||
logger.info('Model input generated successfully')
|
||||
|
||||
return (data,)
|
||||
|
||||
|
||||
def filter_activities_per_obj_id(
|
||||
data: DataFrame,
|
||||
activity_feature: str = 'VorgangsTypName',
|
||||
relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
threshold_num_activities: int = 1,
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities count occurrences for each ObjectID
|
||||
logger.info('Filtering activities per ObjectID')
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
|
||||
# index of series contains ObjectIDs
|
||||
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
|
||||
filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
|
||||
obj_ids_below_thresh
|
||||
)
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info('Activities per ObjectID filtered successfully')
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
return data
|
||||
|
||||
# for each obj_id in relevant_obj_ids
|
||||
## filter data for obj_id
|
||||
@ -130,6 +128,7 @@ def generate_model_input(
|
||||
## obtain idx pairs, yield
|
||||
## use idx pairs to get idx values of series
|
||||
|
||||
|
||||
def get_timeline_candidates_index(
|
||||
data: DataFrame,
|
||||
num_activities_per_obj_id: Series,
|
||||
@ -140,14 +139,10 @@ def get_timeline_candidates_index(
|
||||
model_input_feature: str = 'nlp_model_input',
|
||||
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
|
||||
# already sorted ObjIDs (descending regarding number of activities)
|
||||
obj_ids = cast(Iterable[ObjectID],
|
||||
num_activities_per_obj_id.index)
|
||||
obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)
|
||||
|
||||
for obj_id in tqdm(obj_ids):
|
||||
data_per_obj_id = cast(
|
||||
DataFrame,
|
||||
data.loc[data[feature_obj_id]==obj_id]
|
||||
)
|
||||
data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
|
||||
data_model_input = data_per_obj_id[model_input_feature]
|
||||
|
||||
candidates_idx = candidates_by_index(
|
||||
@ -156,7 +151,7 @@ def get_timeline_candidates_index(
|
||||
cos_sim_threshold=cos_sim_threshold,
|
||||
)
|
||||
# directly process candidates
|
||||
candidates_idx = tuple(candidates_idx)
|
||||
# candidates_idx = tuple(candidates_idx)
|
||||
similar_id_graph, _ = similar_index_connection_graph(
|
||||
similar_idx_pairs=candidates_idx,
|
||||
)
|
||||
@ -164,63 +159,8 @@ def get_timeline_candidates_index(
|
||||
for index_group in similar_index_groups(similar_id_graph):
|
||||
yield obj_id, index_group
|
||||
|
||||
|
||||
# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""Filter candidate index pairs based on cosine similarity,
using a SentenceTransformer model in batch mode;
the data is fed as a Series to retain the indices of the entries so they
can be accessed later in the original dataset

Parameters
----------
data_model_input : Series
Series containing indices and text entries to process
model : SentenceTransformer
SentenceTransformer model used to encode the text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5

Yields
------
Iterator[tuple[PandasIndex, PandasIndex]]
index pairs whose entries meet the cosine similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair

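For reference, a minimal self-contained sketch of the same cosine-similarity pairing on a toy pandas Series; the toy texts, the model checkpoint and the 0.8 threshold are illustrative assumptions, not values fixed by this commit.

import numpy as np
from pandas import Series
from sentence_transformers import SentenceTransformer, util

# toy input: the Series index plays the role of the original dataset index
texts = Series(
    ['Pumpe defekt', 'Pumpe ist defekt', 'Wartung durchgeführt'],
    index=[10, 42, 97],
)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embds = model.encode(texts.to_list(), convert_to_tensor=True, show_progress_bar=False)

# pairwise cosine similarity; zero the diagonal and keep only the upper
# triangle so every unordered pair is considered exactly once
cos_sim = util.cos_sim(embds, embds).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)

# map positional hits back to the original pandas index labels
for i, j in np.argwhere(cos_sim >= 0.8):
    print((texts.index[i], texts.index[j]))
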
def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(

return candidates_by_obj_id

def map_obj_texts(

def map_obj_id_to_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())

obj_id_to_text: dict[ObjectID, str] = {}

for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
for obj_id in tqdm(obj_ids):
data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text

return obj_id_to_text

def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')

return tl_candidates, map_obj_text

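The tuple returned here is what the Dash demo further down consumes. A rough sketch of how the pickled result could be inspected, assuming TimelineCandidates behaves like a mapping from ObjectID to groups of similar row indices (the file path is illustrative):

from lang_main import load_pickle

# the pipeline step writes a pickled (candidates, texts) tuple; path is an assumption
ret = load_pickle('./results/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
tl_candidates, obj_texts = ret[0], ret[1]

for obj_id, groups in tl_candidates.items():
    print(obj_id, obj_texts[obj_id])
    for group in groups:
        # each group is a collection of row indices with similar model input texts
        print('  candidate timeline over rows:', list(group))
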
@ -1,21 +1,20 @@
|
||||
from typing import cast
|
||||
import re
|
||||
from itertools import combinations
|
||||
from collections.abc import Iterator
|
||||
from itertools import combinations
|
||||
from typing import cast
|
||||
|
||||
from dateutil.parser import parse
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from pandas import DataFrame
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
from lang_main.analysis.graphs import (
|
||||
update_graph,
|
||||
TokenGraph,
|
||||
update_graph,
|
||||
)
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
|
||||
# ** Logging
|
||||
# LOGGING_LEVEL = 'INFO'
|
||||
@ -38,13 +37,14 @@ TAG_OF_INTEREST: frozenset[str] = frozenset()
|
||||
|
||||
# ** obtaining connection in texts
|
||||
|
||||
def pre_clean_word(string: str) -> str:
|
||||
|
||||
def pre_clean_word(string: str) -> str:
|
||||
pattern = r'[^A-Za-zäöüÄÖÜ]+'
|
||||
string = re.sub(pattern, '', string)
|
||||
|
||||
return string
|
||||
|
||||
|
||||
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
|
||||
def is_str_date(
|
||||
string: str,
|
||||
@ -67,10 +67,10 @@ def is_str_date(
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def obtain_relevant_descendants(
|
||||
token: SpacyToken,
|
||||
) -> Iterator[SpacyToken]:
|
||||
|
||||
for descendant in token.subtree:
|
||||
# subtrees contain the token itself
|
||||
# if current element is token skip this element
|
||||
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
|
||||
if is_str_date(string=descendant.text):
|
||||
continue
|
||||
|
||||
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
|
||||
f">>{descendant}<<, POS >>{descendant.pos_}<<"))
|
||||
logger.debug(
|
||||
(
|
||||
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
|
||||
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
|
||||
)
|
||||
)
|
||||
|
||||
# eliminate cases of cross-references with verbs
|
||||
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and
|
||||
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
|
||||
if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
|
||||
descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
|
||||
):
|
||||
continue
|
||||
# skip cases in which descendant is indirect POS with others than verbs
|
||||
elif descendant.pos_ in POS_INDIRECT:
|
||||
@ -99,6 +104,7 @@ def obtain_relevant_descendants(
|
||||
|
||||
# TODO look at results and fine-tune function accordingly
|
||||
|
||||
|
||||
def add_doc_info_to_graph(
|
||||
graph: TokenGraph,
|
||||
doc: SpacyDoc,
|
||||
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
|
||||
graph=graph,
|
||||
parent=token.lemma_,
|
||||
child=descendant.lemma_,
|
||||
weight_connection=weight
|
||||
weight_connection=weight,
|
||||
)
|
||||
else:
|
||||
# if indirect POS, make connection between all associated words
|
||||
@ -139,6 +145,7 @@ def add_doc_info_to_graph(
|
||||
weight_connection=weight,
|
||||
)
|
||||
|
||||
|
||||
def build_token_graph(
|
||||
data: DataFrame,
|
||||
model: GermanSpacyModel,
|
||||
|
||||
55  src/lang_main/constants.py  Normal file
@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final

from lang_main import CONFIG

# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_acitivities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]
56  src/lang_main/lang_main_config.toml  Normal file
@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
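As a quick illustration of how such a config file feeds the constants above, a minimal sketch using the standard-library tomllib; the file path used here is an assumption for the example, not part of the commit.

import tomllib
from pathlib import Path

path_to_toml = Path('./src/lang_main/lang_main_config.toml')  # illustrative path
with open(path_to_toml, 'rb') as f:
    config = tomllib.load(f)

# same access pattern as in constants.py
threshold_similarity = config['preprocess']['threshold_similarity']  # 0.8
date_cols = config['preprocess']['date_cols']                        # list of date columns
run_preprocessing = config['control']['preprocessing']               # True
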
@ -1,5 +1,5 @@
|
||||
from typing import Final
|
||||
import logging
|
||||
from typing import Final
|
||||
|
||||
from lang_main.types import LoggingLevels
|
||||
|
||||
|
||||
@ -1,20 +1,18 @@
|
||||
from typing import Any
|
||||
#from types import FunctionType
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_pipelines as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# ** pipelines to perform given actions on dataset in a customisable manner
|
||||
|
||||
|
||||
class NoPerformableActionError(Exception):
|
||||
"""Error describing that no action is available in the current pipeline"""
|
||||
|
||||
class BasePipeline():
|
||||
|
||||
class BasePipeline:
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
@ -27,6 +25,8 @@ class BasePipeline():
|
||||
self.name = name
|
||||
# working directory for pipeline == output path
|
||||
self.working_dir = working_dir
|
||||
# if not self.working_dir.exists():
|
||||
# self.working_dir.mkdir(parents=True)
|
||||
|
||||
# container for actions to perform during pass
|
||||
self.actions: list[Callable] = []
|
||||
@ -39,8 +39,10 @@ class BasePipeline():
|
||||
self._intermediate_result: Any | None = None
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return (f"{self.__class__.__name__}(name: {self.name}, "
|
||||
f"working dir: {self.working_dir}, contents: {self.action_names})")
|
||||
return (
|
||||
f'{self.__class__.__name__}(name: {self.name}, '
|
||||
f'working dir: {self.working_dir}, contents: {self.action_names})'
|
||||
)
|
||||
|
||||
@property
|
||||
def intermediate_result(self) -> Any:
|
||||
@ -60,8 +62,9 @@ class BasePipeline():
|
||||
self.actions_kwargs.append(action_kwargs.copy())
|
||||
self.is_save_result.append(save_result)
|
||||
else:
|
||||
raise TypeError(("Action must be custom function, "
|
||||
f"but is of type >>{type(action)}<<."))
|
||||
raise TypeError(
|
||||
f'Action must be custom function, but is of type >>{type(action)}<<.'
|
||||
)
|
||||
|
||||
# TODO: add multiple entries by utilising simple add method
|
||||
"""
|
||||
@ -107,13 +110,14 @@ class BasePipeline():
|
||||
return data
|
||||
|
||||
def prep_run(self) -> None:
|
||||
logger.info(f"Starting processing pipeline >>{self.name}<<...")
|
||||
logger.info(f'Starting processing pipeline >>{self.name}<<...')
|
||||
# progress tracking
|
||||
self.curr_proc_idx = 1
|
||||
# check if performable actions available
|
||||
if len(self.actions) == 0:
|
||||
raise NoPerformableActionError(("The pipeline does not contain any "
|
||||
"performable actions."))
|
||||
raise NoPerformableActionError(
|
||||
('The pipeline does not contain any ' 'performable actions.')
|
||||
)
|
||||
|
||||
def run(
|
||||
self,
|
||||
@ -135,6 +139,6 @@ class BasePipeline():
|
||||
# processing tracking
|
||||
self.curr_proc_idx += 1
|
||||
|
||||
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.")
|
||||
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
|
||||
|
||||
return ret
|
||||
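To make the intended use of BasePipeline concrete, a rough usage sketch modelled on the predefined pipelines later in this commit; the toy actions and the assumption that each action receives the previous step's output as positional arguments and returns a tuple are illustrative only.

from pathlib import Path

from lang_main.pipelines.base import BasePipeline

# two toy actions (assumed calling convention: previous output + registered kwargs in, tuple out)
def square(x: int) -> tuple[int]:
    return (x * x,)

def shift(x: int, offset: int) -> tuple[int]:
    return (x + offset,)

pipe = BasePipeline(name='Demo', working_dir=Path('./results/demo'))
pipe.add(square)
pipe.add(shift, {'offset': 1}, save_result=True)

ret = pipe.run(starting_values=(3,))  # expected to yield (10,) under the assumption above
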
@ -1,57 +1,144 @@
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import spacy
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from lang_main import (
|
||||
SAVE_PATH_FOLDER,
|
||||
DATE_COLS,
|
||||
FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.preprocessing import (
|
||||
load_raw_data,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
analyse_feature,
|
||||
clean_string_slim,
|
||||
entry_wise_cleansing,
|
||||
analyse_feature,
|
||||
build_cosSim_matrix,
|
||||
filt_thresh_cosSim_matrix,
|
||||
list_cosSim_dupl_candidates,
|
||||
load_raw_data,
|
||||
merge_similarity_dupl,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
)
|
||||
from lang_main.analysis.timeline import (
|
||||
filter_activities_per_obj_id,
|
||||
generate_model_input,
|
||||
get_timeline_candidates,
|
||||
remove_non_relevant_obj_ids,
|
||||
)
|
||||
from lang_main.analysis.tokens import build_token_graph
|
||||
from lang_main.constants import (
|
||||
ACTIVITY_FEATURE,
|
||||
ACTIVITY_TYPES,
|
||||
DATE_COLS,
|
||||
FEATURE_NAME_OBJ_ID,
|
||||
MODEL_INPUT_FEATURES,
|
||||
SAVE_PATH_FOLDER,
|
||||
THRESHOLD_NUM_ACTIVITIES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
THRESHOLD_TIMELINE_SIMILARITY,
|
||||
THRESHOLD_UNIQUE_TEXTS,
|
||||
UNIQUE_CRITERION_FEATURE,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
|
||||
# ** pipeline configuration
|
||||
# ** target feature preparation
|
||||
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
|
||||
pipe_target_feat.add(
|
||||
load_raw_data,
|
||||
{
|
||||
'date_cols': DATE_COLS,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(remove_duplicates)
|
||||
pipe_target_feat.add(remove_NA, save_result=True)
|
||||
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})
|
||||
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)
|
||||
pipe_target_feat.add(
|
||||
entry_wise_cleansing,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
'cleansing_func': clean_string_slim,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(
|
||||
analyse_feature,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
# output: DataFrame containing target feature with
|
||||
# number of occurrences and associated ObjectIDs
|
||||
|
||||
# ** embedding pipe
|
||||
# ?? still needed?
|
||||
# using similarity between entries to catch duplicates with typo or similar content
|
||||
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
model_spacy = spacy.load('de_dep_news_trf')
|
||||
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
|
||||
|
||||
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True)
|
||||
pipe_embds.add(
|
||||
list_cosSim_dupl_candidates,
|
||||
{'save_candidates': True,
|
||||
'saving_path': SAVE_PATH_FOLDER,
|
||||
'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
'pipeline': pipe_embds}, save_result=True)
|
||||
# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
# pipe_embds.add(
|
||||
# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
|
||||
# )
|
||||
# pipe_embds.add(
|
||||
# list_cosSim_dupl_candidates,
|
||||
# {
|
||||
# 'save_candidates': True,
|
||||
# 'saving_path': SAVE_PATH_FOLDER,
|
||||
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
# 'pipeline': pipe_embds,
|
||||
# },
|
||||
# save_result=True,
|
||||
# )
|
||||
|
||||
# ** Merge duplicates
|
||||
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
# pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
pipe_merge.add(
|
||||
merge_similarity_dupl,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_SIMILARITY,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
# ** token analysis
|
||||
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)
|
||||
pipe_token_analysis.add(
|
||||
build_token_graph,
|
||||
{
|
||||
'model': model_spacy,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
|
||||
# ** timeline analysis
|
||||
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_timeline.add(
|
||||
remove_non_relevant_obj_ids,
|
||||
{
|
||||
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
|
||||
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
pipe_timeline.add(
|
||||
generate_model_input,
|
||||
{
|
||||
'target_feature_name': 'nlp_model_input',
|
||||
'model_input_features': MODEL_INPUT_FEATURES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
filter_activities_per_obj_id,
|
||||
{
|
||||
'activity_feature': ACTIVITY_FEATURE,
|
||||
'relevant_activity_types': ACTIVITY_TYPES,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
get_timeline_candidates,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'model_input_feature': 'nlp_model_input',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
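For orientation, a sketch of how these predefined pipelines could be chained by a driver script, following the run_preprocessing pattern shown earlier in this commit; handing the preprocessed DataFrame straight to pipe_timeline is an assumption about the intended wiring, not something this diff fixes.

import typing

from pandas import DataFrame

from lang_main.constants import PATH_TO_DATASET
from lang_main.pipelines.predefined import pipe_target_feat, pipe_timeline

# preprocessing: raw CSV -> cleaned DataFrame (pattern from run_preprocessing)
ret = typing.cast(tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
data = ret[0]

# timeline analysis on the preprocessed data (assumed hand-over between the two pipes)
tl_candidates, obj_texts = pipe_timeline.run(starting_values=(data,))
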
@ -1,38 +1,47 @@
|
||||
from typing import Any
|
||||
import os
|
||||
import shutil
|
||||
import pickle
|
||||
import shutil
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_shared_helpers as logger
|
||||
|
||||
|
||||
# ** Lib
|
||||
def create_saving_folder(
|
||||
saving_path_folder: str | Path,
|
||||
overwrite_existing: bool = False,
|
||||
) -> None:
|
||||
# check for existence of given path
|
||||
if not os.path.exists(saving_path_folder):
|
||||
os.makedirs(saving_path_folder)
|
||||
if isinstance(saving_path_folder, str):
|
||||
saving_path_folder = Path(saving_path_folder)
|
||||
if not saving_path_folder.exists():
|
||||
saving_path_folder.mkdir(parents=True)
|
||||
else:
|
||||
if overwrite_existing:
|
||||
# overwrite if desired (deletes whole path and re-creates it)
|
||||
shutil.rmtree(saving_path_folder)
|
||||
os.makedirs(saving_path_folder)
|
||||
else:
|
||||
logger.info((f"Path >>{saving_path_folder}<< already exists and remained "
|
||||
"unchanged. If you want to overwrite this path, use parameter "
|
||||
">>overwrite_existing<<."))
|
||||
logger.info(
|
||||
(
|
||||
f'Path >>{saving_path_folder}<< already exists and remained '
|
||||
f'unchanged. If you want to overwrite this path, use parameter '
|
||||
f'>>overwrite_existing<<.'
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def load_toml_config(
|
||||
path_to_toml: str | Path,
|
||||
) -> dict[str, Any]:
|
||||
with open(path_to_toml, "rb") as f:
|
||||
with open(path_to_toml, 'rb') as f:
|
||||
data = tomllib.load(f)
|
||||
logger.info("Loaded TOML config file successfully.")
|
||||
logger.info('Loaded TOML config file successfully.')
|
||||
return data
|
||||
|
||||
|
||||
# saving and loading using pickle
|
||||
# careful: pickling from unknown sources can be dangerous
|
||||
def save_pickle(
|
||||
@ -41,16 +50,18 @@ def save_pickle(
|
||||
) -> None:
|
||||
with open(path, 'wb') as file:
|
||||
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
logger.info(f"Saved file successfully under {path}")
|
||||
logger.info(f'Saved file successfully under {path}')
|
||||
|
||||
|
||||
def load_pickle(
|
||||
path: str | Path,
|
||||
) -> Any:
|
||||
with open(path, 'rb') as file:
|
||||
obj = pickle.load(file)
|
||||
logger.info("Loaded file successfully.")
|
||||
logger.info('Loaded file successfully.')
|
||||
return obj
|
||||
|
||||
|
||||
# TODO: remove, too specialised for common application
|
||||
"""
|
||||
def filter_candidates_idx(
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import TypeAlias, Literal
|
||||
from typing import Literal, TypeAlias
|
||||
|
||||
import numpy as np
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
|
||||
@ -13,29 +13,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'ihm_analyse'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from ihm_analyse import CONFIG\n",
|
||||
"from ihm_analyse.lib.preprocess import (\n",
|
||||
"from lang_main import CONFIG\n",
|
||||
"from lang_main.lib.preprocess import (\n",
|
||||
" load_raw_data,\n",
|
||||
" remove_duplicates,\n",
|
||||
" remove_NA,\n",
|
||||
@ -47,8 +43,8 @@
|
||||
" list_cosSim_dupl_candidates,\n",
|
||||
" merge_similarity_dupl,\n",
|
||||
")\n",
|
||||
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from ihm_analyse.lib.helpers import (\n",
|
||||
"from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from lang_main.lib.helpers import (\n",
|
||||
" save_pickle, \n",
|
||||
" load_pickle, \n",
|
||||
" create_saving_folder,\n",
|
||||
|
||||
BIN  test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl  Normal file
Binary file not shown.
@ -1,28 +1,42 @@
|
||||
from typing import cast
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from dash import (
|
||||
Dash,
|
||||
html,
|
||||
dcc,
|
||||
callback,
|
||||
Output,
|
||||
Input,
|
||||
Output,
|
||||
State,
|
||||
callback,
|
||||
dash_table,
|
||||
dcc,
|
||||
html,
|
||||
)
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main import load_pickle
|
||||
from lang_main.types import TimelineCandidates, ObjectID
|
||||
from lang_main.types import ObjectID, TimelineCandidates
|
||||
from pandas import DataFrame
|
||||
|
||||
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
|
||||
|
||||
# ** data
|
||||
data = cast(DataFrame, load_pickle('./data.pkl'))
|
||||
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
|
||||
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
|
||||
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
|
||||
p_tl = Path(
|
||||
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
|
||||
)
|
||||
ret = cast(tuple[DataFrame], load_pickle(p_df))
|
||||
data = ret[0]
|
||||
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
|
||||
cands = ret[0]
|
||||
texts = ret[1]
|
||||
|
||||
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
|
||||
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
|
||||
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
|
||||
# data = cast(DataFrame, load_pickle(p_df))
|
||||
# cands = cast(TimelineCandidates, load_pickle(p_cands))
|
||||
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
|
||||
|
||||
table_feats = [
|
||||
'ErstellungsDatum',
|
||||
'ErledigungsDatum',
|
||||
@ -53,23 +67,26 @@ app = Dash(prevent_initial_callbacks=True)
|
||||
|
||||
app.layout = [
|
||||
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
|
||||
html.Div(children=[
|
||||
html.Div(
|
||||
children=[
|
||||
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
|
||||
dcc.Dropdown(
|
||||
list(cands.keys()),
|
||||
id='dropdown-selection',
|
||||
placeholder="ObjektID auswählen...",
|
||||
)
|
||||
]),
|
||||
html.Div(children=[
|
||||
placeholder='ObjektID auswählen...',
|
||||
),
|
||||
]
|
||||
),
|
||||
html.Div(
|
||||
children=[
|
||||
html.H3(id='object_text'),
|
||||
dcc.Dropdown(id='choice-candidates'),
|
||||
dcc.Graph(id='graph-output'),
|
||||
]),
|
||||
html.Div(children=[
|
||||
dash_table.DataTable(id='table-candidates')
|
||||
]),
|
||||
]
|
||||
),
|
||||
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
|
||||
]
|
||||
|
||||
|
||||
@callback(
|
||||
Output('object_text', 'children'),
|
||||
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
|
||||
headline = f'HObjektText: {obj_text}'
|
||||
return headline
|
||||
|
||||
|
||||
@callback(
|
||||
Output('choice-candidates', 'options'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
@ -93,6 +111,7 @@ def update_choice_candidates(obj_id):
|
||||
choices = list(range(1, len(cands_obj_id) + 1))
|
||||
return choices
|
||||
|
||||
|
||||
@callback(
|
||||
Output('graph-output', 'figure'),
|
||||
Input('choice-candidates', 'value'),
|
||||
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
|
||||
title=title,
|
||||
hover_data=hover_data,
|
||||
)
|
||||
fig.update_traces(
|
||||
mode='markers+lines',
|
||||
marker=markers,
|
||||
marker_symbol='diamond'
|
||||
)
|
||||
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
|
||||
fig.update_xaxes(
|
||||
tickformat="%B\n%Y",
|
||||
tickformat='%B\n%Y',
|
||||
rangeslider_visible=True,
|
||||
)
|
||||
fig.update_yaxes(type='category')
|
||||
fig.update_layout(hovermode="x unified")
|
||||
fig.update_layout(hovermode='x unified')
|
||||
return fig
|
||||
|
||||
|
||||
@callback(
|
||||
[Output('table-candidates', 'data'),
|
||||
Output('table-candidates', 'columns')],
|
||||
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
|
||||
Input('choice-candidates', 'value'),
|
||||
State('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
@ -144,10 +159,10 @@ def update_table_candidates(index, obj_id):
|
||||
cands_choice = cands_obj_id[int(index) - 1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
df = (df
|
||||
.filter(items=table_feats, axis=1)
|
||||
.sort_values(by='ErstellungsDatum', ascending=True))
|
||||
cols = [{"name": i, "id": i} for i in df.columns]
|
||||
df = df.filter(items=table_feats, axis=1).sort_values(
|
||||
by='ErstellungsDatum', ascending=True
|
||||
)
|
||||
cols = [{'name': i, 'id': i} for i in df.columns]
|
||||
# convert dates to strings
|
||||
for col in table_feats_dates:
|
||||
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
|
||||
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
|
||||
table_data = df.to_dict('records')
|
||||
return table_data, cols
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True)
|
||||
56  test-notebooks/dashboard/lang_main_config.toml  Normal file
@ -0,0 +1,56 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = './inputs/'
|
||||
results = './results/test_new2/'
|
||||
dataset = './01_2_Rohdaten_neu/Export4.csv'
|
||||
#results = './results/Export7/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
|
||||
#results = './results/Export7_trunc/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
|
||||
|
||||
[control]
|
||||
preprocessing = true
|
||||
preprocessing_skip = false
|
||||
token_analysis = false
|
||||
token_analysis_skip = false
|
||||
graph_postprocessing = false
|
||||
graph_postprocessing_skip = false
|
||||
time_analysis = false
|
||||
time_analysis_skip = false
|
||||
|
||||
#[export_filenames]
|
||||
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
|
||||
[preprocess]
|
||||
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis.uniqueness]
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
|
||||
[time_analysis.model_input]
|
||||
input_features = [
|
||||
'VorgangsTypName',
|
||||
'VorgangsArtText',
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
activity_feature = 'VorgangsTypName'
|
||||
activity_types = [
|
||||
'Reparaturauftrag (Portal)',
|
||||
'Störungsmeldung',
|
||||
]
|
||||
threshold_num_acitivities = 1
|
||||
threshold_similarity = 0.8
|
||||
Binary file not shown.
663  test-notebooks/display_results.ipynb  Normal file
@ -0,0 +1,663 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from lang_main import load_pickle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_files(path: str) -> tuple[Path, ...]:\n",
|
||||
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
" assert p.exists(), \"path does not exist\"\n",
|
||||
" return tuple(p.glob(r'*'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
|
||||
]
|
||||
},
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file = files[-1]\n",
|
||||
"ret = load_pickle(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 89,
|
||||
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>entry</th>\n",
|
||||
" <th>len</th>\n",
|
||||
" <th>num_occur</th>\n",
|
||||
" <th>assoc_obj_ids</th>\n",
|
||||
" <th>num_assoc_obj_ids</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>162</th>\n",
|
||||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>92592</td>\n",
|
||||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||||
" <td>206</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||||
" <td>39</td>\n",
|
||||
" <td>3108</td>\n",
|
||||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>131</th>\n",
|
||||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||||
" <td>37</td>\n",
|
||||
" <td>1619</td>\n",
|
||||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>160</th>\n",
|
||||
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>1265</td>\n",
|
||||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>140</th>\n",
|
||||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||||
" <td>44</td>\n",
|
||||
" <td>687</td>\n",
|
||||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||||
" <td>166</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2559</th>\n",
|
||||
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
|
||||
" <td>46</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[211]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2558</th>\n",
|
||||
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
|
||||
" <td>30</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[93]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2557</th>\n",
|
||||
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1707]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2556</th>\n",
|
||||
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
|
||||
" <td>173</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6782</th>\n",
|
||||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||||
" <td>106</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>[306, 326]</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>4545 rows × 5 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" entry ... num_assoc_obj_ids\n",
|
||||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
|
||||
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
|
||||
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
|
||||
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
|
||||
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
|
||||
"... ... ... ...\n",
|
||||
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
|
||||
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
|
||||
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
|
||||
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
|
||||
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
|
||||
"\n",
|
||||
"[4545 rows x 5 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 89,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ret[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
|
||||
]
|
||||
},
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file = files[-1]\n",
|
||||
"ret = load_pickle(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>index</th>\n",
|
||||
" <th>entry</th>\n",
|
||||
" <th>len</th>\n",
|
||||
" <th>num_occur</th>\n",
|
||||
" <th>assoc_obj_ids</th>\n",
|
||||
" <th>num_assoc_obj_ids</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>162</td>\n",
|
||||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>92592</td>\n",
|
||||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||||
" <td>206</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>33</td>\n",
|
||||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||||
" <td>39</td>\n",
|
||||
" <td>3108</td>\n",
|
||||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>131</td>\n",
|
||||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||||
" <td>37</td>\n",
|
||||
" <td>1619</td>\n",
|
||||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>160</td>\n",
|
||||
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>1265</td>\n",
|
||||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>140</td>\n",
|
||||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||||
" <td>44</td>\n",
|
||||
" <td>687</td>\n",
|
||||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||||
" <td>166</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6756</th>\n",
|
||||
" <td>2559</td>\n",
|
||||
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
|
||||
" <td>46</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[211]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6757</th>\n",
|
||||
" <td>2558</td>\n",
|
||||
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
|
||||
" <td>30</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[93]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6758</th>\n",
|
||||
" <td>2557</td>\n",
|
||||
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1707]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6759</th>\n",
|
||||
" <td>2556</td>\n",
|
||||
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
|
||||
" <td>173</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6760</th>\n",
|
||||
" <td>6782</td>\n",
|
||||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||||
" <td>106</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>[306, 326]</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>4545 rows × 6 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" index ... num_assoc_obj_ids\n",
|
||||
"0 162 ... 206\n",
|
||||
"1 33 ... 74\n",
|
||||
"2 131 ... 4\n",
|
||||
"3 160 ... 11\n",
|
||||
"4 140 ... 166\n",
|
||||
"... ... ... ...\n",
|
||||
"6756 2559 ... 1\n",
|
||||
"6757 2558 ... 1\n",
|
||||
"6758 2557 ... 1\n",
|
||||
"6759 2556 ... 1\n",
|
||||
"6760 6782 ... 2\n",
|
||||
"\n",
|
||||
"[4545 rows x 6 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 71,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ret[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tqdm.auto import tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 84,
|
||||
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tup = tuple(i for i in range(100000000))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/100000000 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"num = 0\n",
|
||||
"for i in tqdm(tup):\n",
|
||||
" num += i"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"4999999950000000"
|
||||
]
|
||||
},
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "36366147-3632-4518-936e-878563305e49",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"arr = np.random.rand(1000,1000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"idx = np.argwhere(arr >= 0.97)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"30220"
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(idx)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch = idx[:200]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%timeit\n",
|
||||
"tuple(map(tuple, batch))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%timeit\n",
|
||||
"tuple(tuple(x) for x in batch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||