STRF for similarity duplicates, time analysis pipeline, enhanced config

Florian Förster 2024-05-29 16:34:31 +02:00
parent 5d2c97165a
commit bb987e2108
30 changed files with 1875 additions and 693 deletions

View File

@@ -34,3 +34,15 @@ trials = [
    "plotly>=5.22.0",
    "dash>=2.17.0",
]

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]

View File

@@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast

from lang_main import (
    TokenGraph,
    create_saving_folder,
    load_pickle,
)
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series

# ** processing pipeline


def run_preprocessing() -> DataFrame:
@@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
        overwrite_existing=True,
    )
    # run pipelines
    ret = typing.cast(
        tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
    )
    target_feat_data = ret[0]
    # only entries with more than threshold amount of characters
    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    # dupl_idx_pairs, embds = typing.cast(
    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
    #     pipe_embds.run(starting_values=(subset_data,)),
    # )
    # merge duplicates, results saved separately
    subset_data = target_feat_data.loc[data_filter].copy()
    ret = typing.cast(
        tuple[DataFrame],
        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
        pipe_merge.run(starting_values=(subset_data,)),
    )
    preprocessed_data = ret[0]

    return preprocessed_data


def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    # build token graph
    (tk_graph,) = typing.cast(
        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
    )
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    # filter graph by edge weight and remove single nodes (no connection)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(
        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
    )
    tk_graph_filtered.to_pickle(
        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
    )

    return tk_graph_filtered


def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    filename = 'without_nan'
    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    ret = load_pickle(loading_path)
    preprocessed_data = ret[0]
    ret = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )

    return ret


def verify_path(
    loading_path: Path,
) -> None:
    if not loading_path.exists():
        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')


def main() -> None:
    pre_step_skipped: bool = False
    # ** preprocess
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        verify_path(loading_path)
        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
        preprocessed_data = ret[0]
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')
        # sys.exit(0)

    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        preprocessed_data_trunc = typing.cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')

    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                (
                    'Preprocessing or token analysis step skipped. '
                    'Graph postprocessing cannot be performed.'
                )
            )
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames
        # filtered graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')

    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # no check for fails, runs separately
        ret = run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')


if __name__ == '__main__':
    main()
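Every stage in main() above is gated by a DO_*/SKIP_* flag pair taken from the config-driven constants. A minimal sketch of that gating pattern, with hypothetical run_step/load_cached callables standing in for the real pipeline and loading calls:

import warnings
from typing import Any, Callable


def run_or_load(
    do_flag: bool,
    skip_flag: bool,
    run_step: Callable[[], Any],
    load_cached: Callable[[], Any],
) -> Any | None:
    # mirrors the gating in main(): compute, reuse a saved result, or skip entirely
    if do_flag and not skip_flag:
        return run_step()
    elif not skip_flag:
        return load_cached()
    warnings.warn('No action selected for this step. Skipped.')
    return None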

Binary file not shown.

View File

@@ -1,17 +1,21 @@
# lang_main: Config file

[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

View File

@@ -0,0 +1,59 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
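Judging by the reworked package __init__ later in this commit, this file is expected as lang_main_config.toml next to the calling script and is parsed into a plain dictionary. A hedged sketch of reading it with Python's standard tomllib (the key names are the ones shown above, the variable names are illustrative):

import tomllib
from pathlib import Path

# illustrative only: keys follow the config above, names mirror the constants used
# by the scripts
config_path = Path('lang_main_config.toml')
with config_path.open('rb') as file:
    config = tomllib.load(file)

DO_TIME_ANALYSIS = config['control']['time_analysis']
SKIP_TIME_ANALYSIS = config['control']['time_analysis_skip']
THRESHOLD_SIMILARITY = config['time_analysis']['model_input']['threshold_similarity']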

scripts/test.py (new file, +12 lines)
View File

@@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))

View File

@@ -1,18 +1,19 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
    create_saving_folder,
    load_pickle,
    load_toml_config,
    save_pickle,
)

__all__ = [
    'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
    datefmt=LOG_DATE_FMT,
)

CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    cfg_path_external = caller_file.parent / CONFIG_FILENAME
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the programm.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
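The constant definitions removed from this module keep the same CONFIG-lookup pattern and, judging by the imports in the main script, now live in lang_main.constants. A shortened sketch reconstructed from the removed lines:

from pathlib import Path
from typing import Final

# pattern of the derived constants (CONFIG is the dictionary loaded above)
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']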

View File

@@ -1,18 +1,18 @@
import copy
import sys
import typing
from collections.abc import Hashable, Iterable
from pathlib import Path
from typing import Any, Final, Literal, Self, overload

import networkx as nx
import numpy as np
import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame

from lang_main.loggers import logger_graphs as logger
from lang_main.shared import load_pickle, save_pickle

# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@@ -31,8 +31,7 @@ def get_graph_metadata(
    min_edge_weight: int = 1_000_000
    max_edge_weight: int = 0
    for edge in graph.edges:
        weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
        if weight < min_edge_weight:
            min_edge_weight = weight
        if weight > max_edge_weight:
@@ -54,18 +53,20 @@ def get_graph_metadata(
    )
    if logging:
        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')

    return graph_info


def update_graph(
    graph: Graph | DiGraph,
    *,
    batch: Iterable[tuple[Hashable, Hashable]] | None = None,
    parent: Hashable | None = None,
    child: Hashable | None = None,
    weight_connection: int = 1,
) -> None:
    # !! not necessary to check for existence of nodes
@@ -78,7 +79,9 @@ def update_graph(
        graph.add_node(child)
    """
    # check if edge not in Graph
    if batch is not None:
        graph.add_edges_from(batch, weight=weight_connection)
    elif not graph.has_edge(parent, child):
        # create new edge, nodes will be created if not already present
        graph.add_edge(parent, child, weight=weight_connection)
    else:
@@ -87,16 +90,15 @@ def update_graph(
        weight += weight_connection
        graph[parent][child]['weight'] = weight
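update_graph() now accepts either a single parent/child pair or a whole batch of index pairs. A small usage sketch on a plain networkx graph (the tokens are made up):

import networkx as nx

from lang_main.analysis.graphs import update_graph

# usage sketch, not part of the diff: build the same graph with a per-pair call
# or with the new batch keyword
g = nx.Graph()
update_graph(graph=g, parent='pumpe', child='motor')
update_graph(graph=g, batch=[('motor', 'defekt'), ('pumpe', 'defekt')], weight_connection=2)
print(g.edges(data=True))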
# build undirected adjacency matrix
def convert_graph_to_undirected(
    graph: DiGraph,
    logging: bool = LOGGING_DEFAULT,
) -> Graph:
    # get adjacency matrix
    adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
    arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
    # build undirected array: adding edges of lower triangular matrix to upper one
    arr_upper = np.triu(arr)
    arr_lower = np.tril(arr)
@@ -104,18 +106,17 @@ def convert_graph_to_undirected(
    arr_new = arr_upper + arr_lower
    # assign new data and create graph
    adj_mat.loc[:] = arr_new  # type: ignore
    graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
    # info about graph
    if logging:
        logger.info('Successfully converted graph to one with undirected edges.')
        _ = get_graph_metadata(graph=graph_undir, logging=logging)

    return graph_undir


class TokenGraph(DiGraph):
    def __init__(
        self,
        name: str = 'TokenGraph',
@@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
        return self.__str__()

    def __str__(self) -> str:
        return (
            f'TokenGraph(name: {self.name}, number of nodes: '
            f'{len(self.nodes)}, number of edges: '
            f'{len(self.edges)})'
        )

    # !! only used to verify that saving was done correctly
    """
@@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
        self,
        inplace: Literal[True] = ...,
        logging: bool | None = ...,
    ) -> None: ...

    @overload
    def to_undirected(
        self,
        inplace: Literal[False],
        logging: bool | None = ...,
    ) -> Graph: ...

    @overload
    def to_undirected(
        self, inplace: bool = ..., logging: bool | None = ...
    ) -> Graph | None: ...

    def to_undirected(
        self,
@@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
        if logging is None:
            logging = self.logging

        self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
        self._metadata_undirected = get_graph_metadata(
            graph=self._undirected, logging=logging
        )

        if not inplace:
            return self._undirected
@@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
        if logging is None:
            logging = self.logging

        self._metadata_directed = get_graph_metadata(graph=self, logging=logging)

        if self._undirected is not None:
            self._metadata_undirected = get_graph_metadata(
                graph=self._undirected, logging=logging
            )

    def filter_by_edge_weight(
        self,
@@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
        filtered_graph = self.copy()
        for edge in original_graph_edges:
            weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
            if weight < threshold:
                filtered_graph.remove_edge(edge[0], edge[1])
@@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
        filename: str | None = None,
    ) -> Path:
        if filename is not None:
            saving_path = path.joinpath(f'{filename}')
        else:
            saving_path = path.joinpath(f'{self.name}')

        return saving_path
@@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
        elif not directed and self._undirected is not None:
            target_graph = self._undirected
        else:
            raise ValueError('No undirected graph available.')

        saving_path = saving_path.with_suffix('.graphml')
        nx.write_graphml(G=target_graph, path=saving_path)
        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))

    def to_pickle(
        self,
@@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
        match path.suffix:
            case '.graphml':
                graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
                logger.info(f'Successfully loaded graph from GraphML file {path}.')
            case '.pkl' | '.pickle':
                graph = typing.cast(Self, load_pickle(path))
                logger.info(f'Successfully loaded graph from pickle file {path}.')
            case _:
                raise ValueError('File format not supported.')

        return graph
@@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
        path = Path(path)
        if path.suffix not in ('.pkl', '.pickle'):
            raise ValueError('File format not supported.')

        graph = typing.cast(Self, load_pickle(path))
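A hedged round-trip sketch using the pickle helpers shown above; token pairs and the target folder are placeholders:

from pathlib import Path

from lang_main.analysis.graphs import TokenGraph, update_graph

# usage sketch only: a tiny graph, saved and reloaded via the pickle helpers
tk_graph = TokenGraph(name='DemoGraph')
update_graph(graph=tk_graph, batch=[('pumpe', 'defekt'), ('motor', 'defekt')])
tk_graph.to_pickle(Path('./results'), filename='DemoGraph')
reloaded = TokenGraph.from_pickle(Path('./results/DemoGraph.pkl'))
print(reloaded)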

View File

@@ -1,29 +1,29 @@
import re
from collections.abc import Iterable
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import Callable, cast

import numpy as np
import pandas as pd
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex

# ** (1) dataset preparation: loading and simple preprocessing
@@ -67,11 +67,16 @@ def load_raw_data(
        parse_dates=date_cols,
        dayfirst=True,
    )
    logger.info('Loaded dataset successfully.')
    logger.info(
        (
            f'Dataset properties: number of entries: {len(data)}, '
            f'number of features {len(data.columns)}'
        )
    )

    return (data,)


def remove_duplicates(
    data: DataFrame,
) -> tuple[DataFrame]:
@@ -89,7 +94,7 @@ def remove_duplicates(
    """
    # obtain info about duplicates over all features
    duplicates_filt = data.duplicated()
    logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
    # drop duplicates
    wo_duplicates = data.drop_duplicates(ignore_index=True)
    duplicates_subset: list[str] = [
@@ -97,16 +102,26 @@ def remove_duplicates(
        'ObjektID',
    ]
    duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
    logger.info(
        (
            'Number of duplicates over subset '
            f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
        )
    )
    wo_duplicates = wo_duplicates.drop_duplicates(
        subset=duplicates_subset, ignore_index=True
    ).copy()
    logger.info('Removed all duplicates from dataset successfully.')
    logger.info(
        (
            f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
            f'number of features {len(wo_duplicates.columns)}'
        )
    )

    return (wo_duplicates,)


def remove_NA(
    data: DataFrame,
    target_features: list[str] = [
@@ -128,15 +143,16 @@ def remove_NA(
        dataset with removed NA entries for given subset of features
    """
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
    )

    return (wo_NA,)


# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features
@@ -151,13 +167,16 @@ def clean_string_slim(
        cleaned entry
    """
    # remove special chars
    pattern = r'[\t\n\r\f\v]+'
    string = re.sub(pattern, ' ', string)
    pattern = r'([,;.:!?\-_+]){2,}'
    string = re.sub(pattern, r'\1', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


def entry_wise_cleansing(
    data: DataFrame,
    target_feature: str,
@@ -165,10 +184,16 @@ def entry_wise_cleansing(
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    data[target_feature] = data[target_feature].map(cleansing_func)
    logger.info(
        (
            f'Successfully applied entry-wise cleansing procedure '
            f'>>{cleansing_func.__name__}<< '
            f'for feature >>{target_feature}<<'
        )
    )

    return (data,)


# ** in-depth analysis of one feature
# following functions try to gain insights on a given feature of the IHM dataset such
# as number of occurrences or associated Object IDs
@@ -178,7 +203,7 @@ def analyse_feature(
) -> tuple[DataFrame]:
    # feature columns
    feature_entries = data[target_feature]
    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')

    # obtain unique entries
    unique_feature_entries = feature_entries.unique()
@@ -186,7 +211,7 @@ def analyse_feature(
    cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
    result_df = pd.DataFrame(columns=cols)

    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
@@ -195,13 +220,10 @@ def analyse_feature(
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()

        conc_df = pd.DataFrame(
            data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
            columns=cols,
        )

        result_df = pd.concat([result_df, conc_df], ignore_index=True)
@@ -230,9 +252,9 @@ def build_embedding_map(
        is_STRF = True

    if not any((is_spacy, is_STRF)):
        raise NotImplementedError('Model type unknown')

    for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
        # verbose code: Pyright not inferring types correctly
        idx = cast(int, idx)
        text = cast(str, text)
@@ -246,12 +268,17 @@ def build_embedding_map(
            logger.debug(f'{embd.text=} has no vector')
        elif is_STRF:
            model = cast(SentenceTransformer, model)
            embd = cast(Tensor, model.encode(text, show_progress_bar=False))

        embeddings[idx] = (embd, text)

    return embeddings, (is_spacy, is_STRF)


# adapt interface
# use candidates by index function
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
# build similarity matrix out of embeddings
def build_cosSim_matrix(
    data: Series,
@@ -259,10 +286,11 @@ def build_cosSim_matrix(
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
    # build empty matrix
    df_index = data.index
    cosineSim_idx_matrix = pd.DataFrame(
        data=0.0, columns=df_index, index=df_index, dtype=np.float32
    )

    logger.info('Start building embedding map...')
    # obtain embeddings based on used model
    embds, (is_spacy, is_STRF) = build_embedding_map(
@@ -270,15 +298,15 @@ def build_cosSim_matrix(
        model=model,
    )
    logger.info('Embedding map built successfully.')

    # apply index based mapping for efficient handling of large texts
    combs = combinations(df_index, 2)
    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)

    logger.info('Start calculation of similarity scores...')

    for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
        # print(f"{idx1=}, {idx2=}")
        embd1 = embds[idx1][0]
        embd2 = embds[idx2][0]
@@ -296,10 +324,11 @@ def build_cosSim_matrix(
            cosineSim_idx_matrix.at[idx1, idx2] = cosSim

    logger.info('Similarity scores calculated successfully.')

    return cosineSim_idx_matrix, embds


# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
@@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
    Series
        series with multi index (index pairs) and corresponding similarity score
    """
    cosineSim_filt = cast(
        Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
    )

    return cosineSim_filt, embds


def list_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
@@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
        list containing relevant index pairs for entries with similarity score greater than
        given threshold
    """
    logger.info('Start gathering of similarity candidates...')
    # compare found duplicates
    columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)
    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                embds[idx1][1],
                idx2,
                embds[idx2][1],
                score,
            ]
        ]
        # add candidates to collection DataFrame
        df_conc = pd.DataFrame(columns=columns, data=content)
        if df_candidates.empty:
@@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
        # save index pairs
        index_pairs.append((idx1, idx2))

    logger.info('Similarity candidates gathered successfully.')

    if save_candidates:
        if saving_path is None:
            raise ValueError(
                ('Saving path must be provided if duplicate ' 'candidates should be saved.')
            )
        elif pipeline is not None:
            target_filename = (
                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
            )
        elif pipeline is None:
            target_filename = f'{filename}.xlsx'

        logger.info('Saving similarity candidates...')
        target_path = saving_path.joinpath(target_filename)
        df_candidates.to_excel(target_path)
        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')

    return index_pairs, embds


# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py``
@@ -419,16 +455,24 @@ def similar_ids_groups(
        yield list(id_group)
"""


def merge_similarity_dupl(
    data: DataFrame,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> tuple[DataFrame]:
    logger.info('Start merging of similarity candidates...')
    # data
    merged_data = data.copy()
    model_input = merged_data['entry']
    candidates_idx = candidates_by_index(
        data_model_input=model_input,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
    )
    # graph of similar ids
    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)

    for similar_id_group in similar_index_groups(similar_id_graph):
        similar_id_group = list(similar_id_group)
@@ -454,10 +498,11 @@ def merge_similarity_dupl(
        merged_data.update(merged_similar_data)
        merged_data = merged_data.drop(index=similar_id_group)

    logger.info('Similarity candidates merged successfully.')

    return (merged_data.copy(),)


# merge duplicates
def merge_similarity_dupl_old(
    data: DataFrame,
@@ -469,8 +514,7 @@ def merge_similarity_dupl_old(
    # logger.info("Start merging of similarity candidates...")

    # iterate over index pairs
    for i1, i2 in tqdm(dupl_idx_pairs):
        # if an entry does not exist any more, skip this pair
        if i1 not in index or i2 not in index:
            continue
@@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
        given threshold
    """
    # compare found duplicates
    columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)
    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in cosineSim_filt.items():  # type: ignore
        # get texts for comparison
        text1 = embds[idx1][1]
        text2 = embds[idx2][1]
@@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
            continue

        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                text1,
                idx2,
                text2,
                score,
            ]
        ]
        df_conc = pd.DataFrame(columns=columns, data=content)
        df_candidates = pd.concat([df_candidates, df_conc])
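merge_similarity_dupl() now takes a SentenceTransformer model and a cosine-similarity threshold instead of precomputed index pairs. A usage sketch; the model name and pickle path are assumptions, and the data is expected to carry the 'entry' column produced by the preprocessing pipeline:

from pathlib import Path

from sentence_transformers import SentenceTransformer

from lang_main.analysis.preprocessing import merge_similarity_dupl
from lang_main.shared import load_pickle

# hedged usage sketch for the new SentenceTransformer-based signature
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
preprocessed_data = load_pickle(Path('./results/preprocessed.pkl'))[0]
(merged_data,) = merge_similarity_dupl(
    data=preprocessed_data,
    model=model,
    cos_sim_threshold=0.8,
)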

View File

@@ -1,11 +1,71 @@
from collections.abc import Iterable, Iterator
from typing import cast

import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex


def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
    # ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """function to filter candidate indices based on cosine similarity
    using a SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        index pairs which meet the cosine similarity threshold
    """
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair
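A usage sketch for candidates_by_index() together with the graph helpers below; the Series index is preserved in the yielded pairs, so they can be looked up in the original DataFrame, and the model name is an assumption:

import pandas as pd
from sentence_transformers import SentenceTransformer

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

# hedged sketch: toy texts and model name are placeholders
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
texts = pd.Series(
    ['Pumpe defekt', 'Pumpe ist defekt', 'Fenster klemmt'],
    index=[10, 17, 42],
)
pairs = list(candidates_by_index(texts, model=model, cos_sim_threshold=0.8))
graph, _ = similar_index_connection_graph(pairs)
groups = list(similar_index_groups(graph))
print(pairs, groups)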
def similar_index_connection_graph(
@@ -15,21 +75,21 @@ def similar_index_connection_graph(
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


# TODO check returning tuple
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))

    for id_group in ids_groups:
        yield tuple(id_group)

View File

@@ -1,21 +1,17 @@
from collections.abc import Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates


def non_relevant_obj_ids(
@@ -25,16 +21,16 @@ def non_relevant_obj_ids(
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:
    data = data.copy()
    ids_to_ignore: set[ObjectID] = set()
    obj_ids = cast(
        Iterable[ObjectID],  # actually NumPy array
        data[feature_obj_id].unique(),
    )
    for obj_id in obj_ids:
        feats_per_obj_id = cast(
            Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
        )
        # check for uniqueness of given feature for current ObjectID
        # ignore NaN values
@@ -46,14 +42,15 @@ def non_relevant_obj_ids(
    return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset')
    data = data.copy()
    ids_to_ignore = non_relevant_obj_ids(
        data=data,
@@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
    )
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
    logger.info('Non-relevant ObjectIDs removed successfully')

    return (data,)
def generate_model_input(
    data: DataFrame,
@@ -107,8 +74,8 @@ def generate_model_input(
        'VorgangsArtText',
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
@@ -116,9 +83,40 @@ def generate_model_input(
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully')

    return (data,)


def filter_activities_per_obj_id(
    data: DataFrame,
    activity_feature: str = 'VorgangsTypName',
    relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
    feature_obj_id: str = 'ObjektID',
    threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
        Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
    )
    # filter for ObjectIDs with more than given number of activities
    filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
    # index of series contains ObjectIDs
    obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
    filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
        obj_ids_below_thresh
    )
    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully')

    return data_filter_activities, num_activities_per_obj_id
# for each obj_id in relevant_obj_ids
## filter data for obj_id
@@ -130,6 +128,7 @@
## obtain idx pairs, yield
## use idx pairs to get idx values of series


def get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
@@ -140,14 +139,10 @@ def get_timeline_candidates_index(
    model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
    # already sorted ObjIDs (descending regarding number of activities)
    obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

    for obj_id in tqdm(obj_ids):
        data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        data_model_input = data_per_obj_id[model_input_feature]

        candidates_idx = candidates_by_index(
@@ -156,7 +151,7 @@ def get_timeline_candidates_index(
            cos_sim_threshold=cos_sim_threshold,
        )
        # directly process candidates
        # candidates_idx = tuple(candidates_idx)
        similar_id_graph, _ = similar_index_connection_graph(
            similar_idx_pairs=candidates_idx,
        )
@@ -164,63 +159,8 @@ def get_timeline_candidates_index(
        for index_group in similar_index_groups(similar_id_graph):
            yield obj_id, index_group


# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset
Parameters
----------
obj_id : ObjectID
_description_
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5
Yields
------
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
ObjectID and tuple of index pairs which meet the cosine
similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair
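candidates_by_index is removed here, but get_timeline_candidates_index keeps calling the same cosine-similarity pairing (the helper is presumably imported from a shared analysis module after this refactor). A minimal sketch of the technique, assuming a locally available sentence-transformers model; the Series index stands in for the pandas indices of the original dataset, and the 0.8 threshold mirrors threshold_similarity from the config:

import numpy as np
import pandas as pd
import sentence_transformers
from sentence_transformers import SentenceTransformer

# hypothetical mini dataset; keys play the role of the original pandas indices
texts = pd.Series(
    {
        10: 'Pumpe undicht, Dichtung tauschen',
        11: 'Pumpe ist undicht, Dichtung muss getauscht werden',
        12: 'Wöchentliche Sichtkontrolle / Reinigung',
    }
)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embds = model.encode(texts.to_list(), convert_to_tensor=True, show_progress_bar=False)
cos_sim = sentence_transformers.util.cos_sim(embds, embds).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)   # ignore self-similarity
cos_sim = np.triu(cos_sim)       # keep every pair only once
pairs = [
    tuple(texts.index[i] for i in idx)   # map matrix positions back to pandas indices
    for idx in np.argwhere(cos_sim >= 0.8)
]
print(pairs)   # the two pump entries should pair up if they clear the threshold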
def transform_timeline_candidates( def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]], candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates: ) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(
return candidates_by_obj_id return candidates_by_obj_id
def map_obj_texts(
def map_obj_id_to_texts(
data: DataFrame, data: DataFrame,
obj_ids: Iterable[ObjectID], feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]: ) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
obj_id_to_text: dict[ObjectID, str] = {} obj_id_to_text: dict[ObjectID, str] = {}
for obj_id in obj_ids: for obj_id in tqdm(obj_ids):
data_per_obj = cast( data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
# just take first entry # just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0]) obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:') obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text obj_id_to_text[obj_id] = obj_text
return obj_id_to_text return obj_id_to_text
def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')
return tl_candidates, map_obj_text

View File

@ -1,21 +1,20 @@
from typing import cast
import re import re
from itertools import combinations
from collections.abc import Iterator from collections.abc import Iterator
from itertools import combinations
from typing import cast
from dateutil.parser import parse from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import ( from lang_main.analysis.graphs import (
update_graph,
TokenGraph, TokenGraph,
update_graph,
) )
from lang_main.loggers import logger_token_analysis as logger
# ** Logging # ** Logging
# LOGGING_LEVEL = 'INFO' # LOGGING_LEVEL = 'INFO'
@ -38,13 +37,14 @@ TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connection in texts # ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
def pre_clean_word(string: str) -> str:
pattern = r'[^A-Za-zäöüÄÖÜ]+' pattern = r'[^A-Za-zäöüÄÖÜ]+'
string = re.sub(pattern, '', string) string = re.sub(pattern, '', string)
return string return string
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format # https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date( def is_str_date(
string: str, string: str,
@ -67,10 +67,10 @@ def is_str_date(
except ValueError: except ValueError:
return False return False
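is_str_date follows the pattern from the StackOverflow answer linked above: let dateutil try to parse the string and treat a ValueError as "not a date", so date-like tokens can be skipped during descendant collection. A minimal sketch of that check under the assumption that the omitted body uses the fuzzy-parsing variant (looks_like_date is a hypothetical stand-in, not the project's function):

from dateutil.parser import parse

def looks_like_date(string: str, fuzzy: bool = False) -> bool:
    # True if dateutil can interpret the string as a date
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False

print(looks_like_date('12.03.2024'))   # True
print(looks_like_date('Dichtung'))     # False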
def obtain_relevant_descendants( def obtain_relevant_descendants(
token: SpacyToken, token: SpacyToken,
) -> Iterator[SpacyToken]: ) -> Iterator[SpacyToken]:
for descendant in token.subtree: for descendant in token.subtree:
# subtrees contain the token itself # subtrees contain the token itself
# if current element is token skip this element # if current element is token skip this element
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
if is_str_date(string=descendant.text): if is_str_date(string=descendant.text):
continue continue
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant " logger.debug(
f">>{descendant}<<, POS >>{descendant.pos_}<<")) (
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
)
# eliminate cases of cross-references with verbs # eliminate cases of cross-references with verbs
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')): descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
):
continue continue
# skip cases in which descendant is indirect POS with others than verbs # skip cases in which descendant is indirect POS with others than verbs
elif descendant.pos_ in POS_INDIRECT: elif descendant.pos_ in POS_INDIRECT:
@ -99,6 +104,7 @@ def obtain_relevant_descendants(
# TODO look at results and fine-tune function accordingly # TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph( def add_doc_info_to_graph(
graph: TokenGraph, graph: TokenGraph,
doc: SpacyDoc, doc: SpacyDoc,
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
graph=graph, graph=graph,
parent=token.lemma_, parent=token.lemma_,
child=descendant.lemma_, child=descendant.lemma_,
weight_connection=weight weight_connection=weight,
) )
else: else:
# if indirect POS, make connection between all associated words # if indirect POS, make connection between all associated words
@ -139,6 +145,7 @@ def add_doc_info_to_graph(
weight_connection=weight, weight_connection=weight,
) )
def build_token_graph( def build_token_graph(
data: DataFrame, data: DataFrame,
model: GermanSpacyModel, model: GermanSpacyModel,
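add_doc_info_to_graph walks each token's dependency subtree and connects lemmas in the TokenGraph, increasing the edge weight whenever a pair co-occurs again. A rough sketch of that idea, using networkx in place of the project's TokenGraph/update_graph helpers so it stays self-contained; the POS and date filtering are reduced to a bare minimum and do not mirror the real rules, and the de_dep_news_trf model is assumed to be installed:

import networkx as nx
import spacy

nlp = spacy.load('de_dep_news_trf')   # same German model the pipelines load
graph = nx.Graph()

for doc in nlp.pipe(['Dichtung der Pumpe getauscht', 'Pumpe erneut undicht']):
    for token in doc:
        if token.pos_ not in {'NOUN', 'VERB', 'AUX'}:
            continue
        for descendant in token.subtree:
            # the subtree contains the token itself; skip it and punctuation
            if descendant.i == token.i or descendant.pos_ == 'PUNCT':
                continue
            parent, child = token.lemma_, descendant.lemma_
            if graph.has_edge(parent, child):
                graph[parent][child]['weight'] += 1   # strengthen an existing connection
            else:
                graph.add_edge(parent, child, weight=1)

print(sorted(graph.edges(data='weight')))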

View File

@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final
from lang_main import CONFIG
# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export
# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis
# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_activities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8
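The [control] table decides which pipeline stages run; each stage also has a *_skip companion flag (their exact semantics, e.g. loading previously saved results instead of recomputing, are an assumption here). A minimal sketch of reading those flags with tomllib, the same loader lang_main.shared uses, assuming the file above is stored as config.toml in the working directory:

import tomllib
from pathlib import Path

with Path('config.toml').open('rb') as f:   # hypothetical location of the file above
    config = tomllib.load(f)

for stage in ('preprocessing', 'token_analysis', 'graph_postprocessing', 'time_analysis'):
    run_stage = config['control'][stage]
    skip_stage = config['control'][f'{stage}_skip']
    print(f'{stage}: run={run_stage}, skip={skip_stage}')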

View File

@ -1,5 +1,5 @@
from typing import Final
import logging import logging
from typing import Final
from lang_main.types import LoggingLevels from lang_main.types import LoggingLevels

View File

@ -1,20 +1,18 @@
from typing import Any
#from types import FunctionType
import sys
import logging
from collections.abc import Callable from collections.abc import Callable
from pathlib import Path from pathlib import Path
from typing import Any
from lang_main.loggers import logger_pipelines as logger from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle from lang_main.shared import load_pickle, save_pickle
# ** pipelines to perform given actions on dataset in a customisable manner # ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception): class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline""" """Error describing that no action is available in the current pipeline"""
class BasePipeline():
class BasePipeline:
def __init__( def __init__(
self, self,
name: str, name: str,
@ -27,6 +25,8 @@ class BasePipeline():
self.name = name self.name = name
# working directory for pipeline == output path # working directory for pipeline == output path
self.working_dir = working_dir self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass # container for actions to perform during pass
self.actions: list[Callable] = [] self.actions: list[Callable] = []
@ -39,8 +39,10 @@ class BasePipeline():
self._intermediate_result: Any | None = None self._intermediate_result: Any | None = None
def __repr__(self) -> str: def __repr__(self) -> str:
return (f"{self.__class__.__name__}(name: {self.name}, " return (
f"working dir: {self.working_dir}, contents: {self.action_names})") f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
@property @property
def intermediate_result(self) -> Any: def intermediate_result(self) -> Any:
@ -60,8 +62,9 @@ class BasePipeline():
self.actions_kwargs.append(action_kwargs.copy()) self.actions_kwargs.append(action_kwargs.copy())
self.is_save_result.append(save_result) self.is_save_result.append(save_result)
else: else:
raise TypeError(("Action must be custom function, " raise TypeError(
f"but is of type >>{type(action)}<<.")) f'Action must be custom function, but is of type >>{type(action)}<<.'
)
# TODO: add multiple entries by utilising simple add method # TODO: add multiple entries by utilising simple add method
""" """
@ -107,13 +110,14 @@ class BasePipeline():
return data return data
def prep_run(self) -> None: def prep_run(self) -> None:
logger.info(f"Starting processing pipeline >>{self.name}<<...") logger.info(f'Starting processing pipeline >>{self.name}<<...')
# progress tracking # progress tracking
self.curr_proc_idx = 1 self.curr_proc_idx = 1
# check if performable actions available # check if performable actions available
if len(self.actions) == 0: if len(self.actions) == 0:
raise NoPerformableActionError(("The pipeline does not contain any " raise NoPerformableActionError(
"performable actions.")) ('The pipeline does not contain any ' 'performable actions.')
)
def run( def run(
self, self,
@ -135,6 +139,6 @@ class BasePipeline():
# processing tracking # processing tracking
self.curr_proc_idx += 1 self.curr_proc_idx += 1
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.") logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
return ret return ret
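BasePipeline simply chains plain functions: add registers an action with optional keyword arguments and a save_result flag, and run feeds starting_values through the chain, pickling intermediate results into working_dir when requested. A hedged usage sketch with toy actions; the calling convention assumed here (each action receives the previous step's returned tuple unpacked as positional arguments and returns a tuple) is inferred from the predefined pipelines and may not match every detail:

from pathlib import Path

from lang_main.pipelines.base import BasePipeline

def make_numbers(n: int) -> tuple[list[int]]:
    # toy first action: build a list from the starting value
    return (list(range(n)),)

def square_numbers(numbers: list[int]) -> tuple[list[int]]:
    # toy second action: consume the previous action's output
    return ([x * x for x in numbers],)

pipe_demo = BasePipeline(name='Demo', working_dir=Path('./results/demo'))
pipe_demo.add(make_numbers)
pipe_demo.add(square_numbers)
ret = pipe_demo.run(starting_values=(5,))
print(ret)   # expected: ([0, 1, 4, 9, 16],)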

View File

@ -1,57 +1,144 @@
from sentence_transformers import SentenceTransformer
import spacy import spacy
from sentence_transformers import SentenceTransformer
from lang_main import (
SAVE_PATH_FOLDER,
DATE_COLS,
FILENAME_COSSIM_FILTER_CANDIDATES,
THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import ( from lang_main.analysis.preprocessing import (
load_raw_data, analyse_feature,
remove_duplicates,
remove_NA,
clean_string_slim, clean_string_slim,
entry_wise_cleansing, entry_wise_cleansing,
analyse_feature, load_raw_data,
build_cosSim_matrix,
filt_thresh_cosSim_matrix,
list_cosSim_dupl_candidates,
merge_similarity_dupl, merge_similarity_dupl,
remove_duplicates,
remove_NA,
)
from lang_main.analysis.timeline import (
filter_activities_per_obj_id,
generate_model_input,
get_timeline_candidates,
remove_non_relevant_obj_ids,
) )
from lang_main.analysis.tokens import build_token_graph from lang_main.analysis.tokens import build_token_graph
from lang_main.constants import (
ACTIVITY_FEATURE,
ACTIVITY_TYPES,
DATE_COLS,
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
SAVE_PATH_FOLDER,
THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import BasePipeline
# ** pipeline configuration # ** pipeline configuration
# ** target feature preparation # ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER) pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS}) pipe_target_feat.add(
load_raw_data,
{
'date_cols': DATE_COLS,
},
)
pipe_target_feat.add(remove_duplicates) pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True) pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim}) pipe_target_feat.add(
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True) entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
)
pipe_target_feat.add(
analyse_feature,
{
'target_feature': 'VorgangsBeschreibung',
},
save_result=True,
)
# output: DataFrame containing target feature with # output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs # number of occurrences and associated ObjectIDs
# ** embedding pipe # ** embedding pipe
# ?? still needed?
# using similarity between entries to catch duplicates with typo or similar content # using similarity between entries to catch duplicates with typo or similar content
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) # pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
model_spacy = spacy.load('de_dep_news_trf') model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) # pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True) # pipe_embds.add(
pipe_embds.add( # filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
list_cosSim_dupl_candidates, # )
{'save_candidates': True, # pipe_embds.add(
'saving_path': SAVE_PATH_FOLDER, # list_cosSim_dupl_candidates,
'filename': FILENAME_COSSIM_FILTER_CANDIDATES, # {
'pipeline': pipe_embds}, save_result=True) # 'save_candidates': True,
# 'saving_path': SAVE_PATH_FOLDER,
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
# 'pipeline': pipe_embds,
# },
# save_result=True,
# )
# ** Merge duplicates # ** Merge duplicates
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER) pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(merge_similarity_dupl, save_result=True) # pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
merge_similarity_dupl,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
)
# ** token analysis # ** token analysis
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER) pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True) pipe_token_analysis.add(
build_token_graph,
{
'model': model_spacy,
},
save_result=True,
)
# ** timeline analysis
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
},
save_result=True,
)
pipe_timeline.add(
generate_model_input,
{
'target_feature_name': 'nlp_model_input',
'model_input_features': MODEL_INPUT_FEATURES,
},
)
pipe_timeline.add(
filter_activities_per_obj_id,
{
'activity_feature': ACTIVITY_FEATURE,
'relevant_activity_types': ACTIVITY_TYPES,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
},
)
pipe_timeline.add(
get_timeline_candidates,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',
},
save_result=True,
)

View File

@ -1,38 +1,47 @@
from typing import Any
import os import os
import shutil
import pickle import pickle
import shutil
import tomllib import tomllib
from pathlib import Path from pathlib import Path
from typing import Any
from lang_main.loggers import logger_shared_helpers as logger from lang_main.loggers import logger_shared_helpers as logger
# ** Lib # ** Lib
def create_saving_folder( def create_saving_folder(
saving_path_folder: str | Path, saving_path_folder: str | Path,
overwrite_existing: bool = False, overwrite_existing: bool = False,
) -> None: ) -> None:
# check for existence of given path # check for existence of given path
if not os.path.exists(saving_path_folder): if isinstance(saving_path_folder, str):
os.makedirs(saving_path_folder) saving_path_folder = Path(saving_path_folder)
if not saving_path_folder.exists():
saving_path_folder.mkdir(parents=True)
else: else:
if overwrite_existing: if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it) # overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder) shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder) os.makedirs(saving_path_folder)
else: else:
logger.info((f"Path >>{saving_path_folder}<< already exists and remained " logger.info(
"unchanged. If you want to overwrite this path, use parameter " (
">>overwrite_existing<<.")) f'Path >>{saving_path_folder}<< already exists and remained '
f'unchanged. If you want to overwrite this path, use parameter '
f'>>overwrite_existing<<.'
)
)
def load_toml_config( def load_toml_config(
path_to_toml: str | Path, path_to_toml: str | Path,
) -> dict[str, Any]: ) -> dict[str, Any]:
with open(path_to_toml, "rb") as f: with open(path_to_toml, 'rb') as f:
data = tomllib.load(f) data = tomllib.load(f)
logger.info("Loaded TOML config file successfully.") logger.info('Loaded TOML config file successfully.')
return data return data
# saving and loading using pickle # saving and loading using pickle
# careful: pickling from unknown sources can be dangerous # careful: pickling from unknown sources can be dangerous
def save_pickle( def save_pickle(
@ -41,16 +50,18 @@ def save_pickle(
) -> None: ) -> None:
with open(path, 'wb') as file: with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f"Saved file successfully under {path}") logger.info(f'Saved file successfully under {path}')
def load_pickle( def load_pickle(
path: str | Path, path: str | Path,
) -> Any: ) -> Any:
with open(path, 'rb') as file: with open(path, 'rb') as file:
obj = pickle.load(file) obj = pickle.load(file)
logger.info("Loaded file successfully.") logger.info('Loaded file successfully.')
return obj return obj
# TODO: remove, too specialised for common application # TODO: remove, too specialised for common application
""" """
def filter_candidates_idx( def filter_candidates_idx(

View File

@ -1,4 +1,4 @@
from typing import TypeAlias, Literal from typing import Literal, TypeAlias
import numpy as np import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc from spacy.tokens.doc import Doc as SpacyDoc

View File

@ -13,29 +13,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 2,
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "ModuleNotFoundError",
"output_type": "stream", "evalue": "No module named 'ihm_analyse'",
"text": [ "output_type": "error",
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" "traceback": [
] "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
}, "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
{ "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
"name": "stderr", "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
] ]
} }
], ],
"source": [ "source": [
"from ihm_analyse import CONFIG\n", "from lang_main import CONFIG\n",
"from ihm_analyse.lib.preprocess import (\n", "from lang_main.lib.preprocess import (\n",
" load_raw_data,\n", " load_raw_data,\n",
" remove_duplicates,\n", " remove_duplicates,\n",
" remove_NA,\n", " remove_NA,\n",
@ -47,8 +43,8 @@
" list_cosSim_dupl_candidates,\n", " list_cosSim_dupl_candidates,\n",
" merge_similarity_dupl,\n", " merge_similarity_dupl,\n",
")\n", ")\n",
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", "from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
"from ihm_analyse.lib.helpers import (\n", "from lang_main.lib.helpers import (\n",
" save_pickle, \n", " save_pickle, \n",
" load_pickle, \n", " load_pickle, \n",
" create_saving_folder,\n", " create_saving_folder,\n",

View File

@ -1,28 +1,42 @@
from typing import cast from typing import cast
from pathlib import Path
import pandas as pd
import plotly.express as px
from dash import ( from dash import (
Dash, Dash,
html,
dcc,
callback,
Output,
Input, Input,
Output,
State, State,
callback,
dash_table, dash_table,
dcc,
html,
) )
import plotly.express as px
import pandas as pd
from pandas import DataFrame
from lang_main import load_pickle from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data # ** data
data = cast(DataFrame, load_pickle('./data.pkl')) p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl')) p_tl = Path(
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl')) r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [ table_feats = [
'ErstellungsDatum', 'ErstellungsDatum',
'ErledigungsDatum', 'ErledigungsDatum',
@ -53,23 +67,26 @@ app = Dash(prevent_initial_callbacks=True)
app.layout = [ app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}), html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(children=[ html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'), html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown( dcc.Dropdown(
list(cands.keys()), list(cands.keys()),
id='dropdown-selection', id='dropdown-selection',
placeholder="ObjektID auswählen...", placeholder='ObjektID auswählen...',
) ),
]), ]
html.Div(children=[ ),
html.Div(
children=[
html.H3(id='object_text'), html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'), dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'), dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
] ]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback( @callback(
Output('object_text', 'children'), Output('object_text', 'children'),
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
headline = f'HObjektText: {obj_text}' headline = f'HObjektText: {obj_text}'
return headline return headline
@callback( @callback(
Output('choice-candidates', 'options'), Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'), Input('dropdown-selection', 'value'),
@ -93,6 +111,7 @@ def update_choice_candidates(obj_id):
choices = list(range(1, len(cands_obj_id) + 1)) choices = list(range(1, len(cands_obj_id) + 1))
return choices return choices
@callback( @callback(
Output('graph-output', 'figure'), Output('graph-output', 'figure'),
Input('choice-candidates', 'value'), Input('choice-candidates', 'value'),
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
title=title, title=title,
hover_data=hover_data, hover_data=hover_data,
) )
fig.update_traces( fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_xaxes( fig.update_xaxes(
tickformat="%B\n%Y", tickformat='%B\n%Y',
rangeslider_visible=True, rangeslider_visible=True,
) )
fig.update_yaxes(type='category') fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified") fig.update_layout(hovermode='x unified')
return fig return fig
@callback( @callback(
[Output('table-candidates', 'data'), [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'), Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'), State('dropdown-selection', 'value'),
prevent_initial_call=True, prevent_initial_call=True,
@ -144,10 +159,10 @@ def update_table_candidates(index, obj_id):
cands_choice = cands_obj_id[int(index) - 1] cands_choice = cands_obj_id[int(index) - 1]
# data # data
df = data.loc[list(cands_choice)].sort_index() df = data.loc[list(cands_choice)].sort_index()
df = (df df = df.filter(items=table_feats, axis=1).sort_values(
.filter(items=table_feats, axis=1) by='ErstellungsDatum', ascending=True
.sort_values(by='ErstellungsDatum', ascending=True)) )
cols = [{"name": i, "id": i} for i in df.columns] cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings # convert dates to strings
for col in table_feats_dates: for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d') df[col] = df[col].dt.strftime(r'%Y-%m-%d')
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
table_data = df.to_dict('records') table_data = df.to_dict('records')
return table_data, cols return table_data, cols
if __name__ == '__main__': if __name__ == '__main__':
app.run(debug=True) app.run(debug=True)

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

View File

@ -0,0 +1,663 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from lang_main import load_pickle"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
"metadata": {},
"outputs": [],
"source": [
"def get_files(path: str) -> tuple[Path, ...]:\n",
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
" assert p.exists(), \"path does not exist\"\n",
" return tuple(p.glob(r'*'))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2559</th>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2558</th>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2557</th>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2556</th>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6782</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry ... num_assoc_obj_ids\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
"... ... ... ...\n",
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
"\n",
"[4545 rows x 5 columns]"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>162</td>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>33</td>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>131</td>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160</td>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>140</td>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>2559</td>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6757</th>\n",
" <td>2558</td>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6758</th>\n",
" <td>2557</td>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6759</th>\n",
" <td>2556</td>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6760</th>\n",
" <td>6782</td>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" index ... num_assoc_obj_ids\n",
"0 162 ... 206\n",
"1 33 ... 74\n",
"2 131 ... 4\n",
"3 160 ... 11\n",
"4 140 ... 166\n",
"... ... ... ...\n",
"6756 2559 ... 1\n",
"6757 2558 ... 1\n",
"6758 2557 ... 1\n",
"6759 2556 ... 1\n",
"6760 6782 ... 2\n",
"\n",
"[4545 rows x 6 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 72,
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(i for i in range(100000000))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num = 0\n",
"for i in tqdm(tup):\n",
" num += i"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4999999950000000"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36366147-3632-4518-936e-878563305e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
"metadata": {},
"outputs": [],
"source": [
"arr = np.random.rand(1000,1000)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
"metadata": {},
"outputs": [],
"source": [
"idx = np.argwhere(arr >= 0.97)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30220"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
"metadata": {},
"outputs": [],
"source": [
"batch = idx[:200]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(map(tuple, batch))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(tuple(x) for x in batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}