enhanced timeline analysis

This commit is contained in:
Florian Förster 2024-05-22 18:11:46 +02:00
parent df16b29191
commit 5d2c97165a
18 changed files with 2789 additions and 75 deletions

175
pdm.lock generated
View File

@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
content_hash = "sha256:fc88dc465a3d04eb53b847d7b58db1e55ce8adb004489102cceea01fb52527dc"
content_hash = "sha256:7574154c6728ede3eaf76a8b1a3b5d4339fcc8f2dc8c41042401004b6583e151"
[[package]]
name = "annotated-types"
@ -182,6 +182,17 @@ files = [
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
]
[[package]]
name = "blinker"
version = "1.8.2"
requires_python = ">=3.8"
summary = "Fast, simple object-to-object and broadcast signaling"
groups = ["trials"]
files = [
{file = "blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01"},
{file = "blinker-1.8.2.tar.gz", hash = "sha256:8f77b09d3bf7c795e969e9486f39c2c5e9c39d4ee07424be2bc594ece9642d83"},
]
[[package]]
name = "blis"
version = "0.7.11"
@ -220,7 +231,7 @@ name = "certifi"
version = "2024.2.2"
requires_python = ">=3.6"
summary = "Python package for providing Mozilla's CA Bundle."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"},
{file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
@ -265,7 +276,7 @@ name = "charset-normalizer"
version = "3.3.2"
requires_python = ">=3.7.0"
summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
@ -306,7 +317,7 @@ name = "click"
version = "8.1.7"
requires_python = ">=3.7"
summary = "Composable command line interface toolkit"
groups = ["default"]
groups = ["default", "trials"]
dependencies = [
"colorama; platform_system == \"Windows\"",
]
@ -331,7 +342,7 @@ name = "colorama"
version = "0.4.6"
requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
summary = "Cross-platform colored terminal text."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
marker = "platform_system == \"Windows\" or sys_platform == \"win32\""
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
@ -386,6 +397,61 @@ files = [
{file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"},
]
[[package]]
name = "dash"
version = "2.17.0"
requires_python = ">=3.8"
summary = "A Python framework for building reactive web-apps. Developed by Plotly."
groups = ["trials"]
dependencies = [
"Flask<3.1,>=1.0.4",
"Werkzeug<3.1",
"dash-core-components==2.0.0",
"dash-html-components==2.0.0",
"dash-table==5.0.0",
"importlib-metadata",
"nest-asyncio",
"plotly>=5.0.0",
"requests",
"retrying",
"setuptools",
"typing-extensions>=4.1.1",
]
files = [
{file = "dash-2.17.0-py3-none-any.whl", hash = "sha256:2421569023b2cd46ea2d4b2c14fe72c71b7436527a3102219b2265fa361e7c67"},
{file = "dash-2.17.0.tar.gz", hash = "sha256:d065cd88771e45d0485993be0d27565e08918cb7edd18e31ee1c5b41252fc2fa"},
]
[[package]]
name = "dash-core-components"
version = "2.0.0"
summary = "Core component suite for Dash"
groups = ["trials"]
files = [
{file = "dash_core_components-2.0.0-py3-none-any.whl", hash = "sha256:52b8e8cce13b18d0802ee3acbc5e888cb1248a04968f962d63d070400af2e346"},
{file = "dash_core_components-2.0.0.tar.gz", hash = "sha256:c6733874af975e552f95a1398a16c2ee7df14ce43fa60bb3718a3c6e0b63ffee"},
]
[[package]]
name = "dash-html-components"
version = "2.0.0"
summary = "Vanilla HTML components for Dash"
groups = ["trials"]
files = [
{file = "dash_html_components-2.0.0-py3-none-any.whl", hash = "sha256:b42cc903713c9706af03b3f2548bda4be7307a7cf89b7d6eae3da872717d1b63"},
{file = "dash_html_components-2.0.0.tar.gz", hash = "sha256:8703a601080f02619a6390998e0b3da4a5daabe97a1fd7a9cebc09d015f26e50"},
]
[[package]]
name = "dash-table"
version = "5.0.0"
summary = "Dash table"
groups = ["trials"]
files = [
{file = "dash_table-5.0.0-py3-none-any.whl", hash = "sha256:19036fa352bb1c11baf38068ec62d172f0515f73ca3276c79dee49b95ddc16c9"},
{file = "dash_table-5.0.0.tar.gz", hash = "sha256:18624d693d4c8ef2ddec99a6f167593437a7ea0bf153aa20f318c170c5bc7308"},
]
[[package]]
name = "debugpy"
version = "1.8.1"
@ -459,6 +525,24 @@ files = [
{file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"},
]
[[package]]
name = "flask"
version = "3.0.3"
requires_python = ">=3.8"
summary = "A simple framework for building complex web applications."
groups = ["trials"]
dependencies = [
"Jinja2>=3.1.2",
"Werkzeug>=3.0.0",
"blinker>=1.6.2",
"click>=8.1.3",
"itsdangerous>=2.1.2",
]
files = [
{file = "flask-3.0.3-py3-none-any.whl", hash = "sha256:34e815dfaa43340d1d15a5c3a02b8476004037eb4840b34910c6e21679d288f3"},
{file = "flask-3.0.3.tar.gz", hash = "sha256:ceb27b0af3823ea2737928a4d99d125a06175b8512c445cbd9a9ce200ef76842"},
]
[[package]]
name = "fqdn"
version = "1.5.1"
@ -550,12 +634,26 @@ name = "idna"
version = "3.7"
requires_python = ">=3.5"
summary = "Internationalized Domain Names in Applications (IDNA)"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
]
[[package]]
name = "importlib-metadata"
version = "7.1.0"
requires_python = ">=3.8"
summary = "Read metadata from Python packages"
groups = ["trials"]
dependencies = [
"zipp>=0.5",
]
files = [
{file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"},
{file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"},
]
[[package]]
name = "intel-openmp"
version = "2021.4.0"
@ -651,6 +749,17 @@ files = [
{file = "isoduration-20.11.0.tar.gz", hash = "sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9"},
]
[[package]]
name = "itsdangerous"
version = "2.2.0"
requires_python = ">=3.8"
summary = "Safely pass data to untrusted environments and back."
groups = ["trials"]
files = [
{file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
{file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"},
]
[[package]]
name = "jedi"
version = "0.19.1"
@ -670,7 +779,7 @@ name = "jinja2"
version = "3.1.4"
requires_python = ">=3.7"
summary = "A very fast and expressive template engine."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
dependencies = [
"MarkupSafe>=2.0",
]
@ -1038,7 +1147,7 @@ name = "markupsafe"
version = "2.1.5"
requires_python = ">=3.7"
summary = "Safely add untrusted strings to HTML/XML markup."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
@ -1203,7 +1312,7 @@ name = "nest-asyncio"
version = "1.6.0"
requires_python = ">=3.5"
summary = "Patch asyncio to allow nested event loops"
groups = ["notebooks"]
groups = ["notebooks", "trials"]
files = [
{file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
{file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
@ -1974,7 +2083,7 @@ name = "requests"
version = "2.31.0"
requires_python = ">=3.7"
summary = "Python HTTP for Humans."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
dependencies = [
"certifi>=2017.4.17",
"charset-normalizer<4,>=2",
@ -1986,6 +2095,19 @@ files = [
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
]
[[package]]
name = "retrying"
version = "1.3.4"
summary = "Retrying"
groups = ["trials"]
dependencies = [
"six>=1.7.0",
]
files = [
{file = "retrying-1.3.4-py3-none-any.whl", hash = "sha256:8cc4d43cb8e1125e0ff3344e9de678fefd85db3b750b81b2240dc0183af37b35"},
{file = "retrying-1.3.4.tar.gz", hash = "sha256:345da8c5765bd982b1d1915deb9102fd3d1f7ad16bd84a9700b85f64d24e8f3e"},
]
[[package]]
name = "rfc3339-validator"
version = "0.1.4"
@ -2229,7 +2351,7 @@ name = "setuptools"
version = "69.5.1"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
groups = ["default", "trials"]
files = [
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
@ -2240,7 +2362,7 @@ name = "six"
version = "1.16.0"
requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
summary = "Python 2 and 3 compatibility utilities"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@ -2806,7 +2928,7 @@ name = "typing-extensions"
version = "4.11.0"
requires_python = ">=3.8"
summary = "Backported and Experimental Type Hints for Python 3.8+"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
@ -2839,7 +2961,7 @@ name = "urllib3"
version = "2.2.1"
requires_python = ">=3.8"
summary = "HTTP library with thread-safe connection pooling, file post, and more."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
@ -2923,6 +3045,20 @@ files = [
{file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"},
]
[[package]]
name = "werkzeug"
version = "3.0.3"
requires_python = ">=3.8"
summary = "The comprehensive WSGI web application library."
groups = ["trials"]
dependencies = [
"MarkupSafe>=2.1.1",
]
files = [
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
]
[[package]]
name = "widgetsnbextension"
version = "4.0.10"
@ -2933,3 +3069,14 @@ files = [
{file = "widgetsnbextension-4.0.10-py3-none-any.whl", hash = "sha256:d37c3724ec32d8c48400a435ecfa7d3e259995201fbefa37163124a9fcb393cc"},
{file = "widgetsnbextension-4.0.10.tar.gz", hash = "sha256:64196c5ff3b9a9183a8e699a4227fb0b7002f252c814098e66c4d1cd0644688f"},
]
[[package]]
name = "zipp"
version = "3.18.2"
requires_python = ">=3.8"
summary = "Backport of pathlib-compatible object wrapper for zip files"
groups = ["trials"]
files = [
{file = "zipp-3.18.2-py3-none-any.whl", hash = "sha256:dce197b859eb796242b0622af1b8beb0a722d52aa2f57133ead08edd5bf5374e"},
{file = "zipp-3.18.2.tar.gz", hash = "sha256:6278d9ddbcfb1f1089a88fde84481528b07b0e10474e09dcfe53dad4069fa059"},
]

View File

@ -32,4 +32,5 @@ notebooks = [
]
trials = [
"plotly>=5.22.0",
"dash>=2.17.0",
]
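Not part of the commit: a quick, hedged sanity check that the optional "trials" group resolved correctly after installing it (e.g. via `pdm install -G trials`); the version attributes are standard for both packages.
# illustrative check, assumes the trials group was installed via PDM
import dash
import plotly
print(dash.__version__, plotly.__version__)  # expect 2.17.x and 5.22+ per pdm.lock / pyproject.toml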

View File

@ -1,5 +1,8 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
from pathlib import Path
from lang_main.shared import (
@ -11,7 +14,6 @@ from lang_main.shared import (
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
__all__ = [
'save_pickle',
'load_pickle',
@ -21,6 +23,15 @@ __all__ = [
'TokenGraph',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
USE_INTERNAL_CONFIG: Final[bool] = True
# load config data: internal/external
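For orientation (not part of the commit): a minimal, self-contained sketch of how the root configuration above interacts with the named loggers set up in loggers.py. The logger name is taken from loggers.py; the message is one of the new log lines from this commit, and the printed module name depends on where the sketch is run.
import logging
import sys
from time import gmtime

logging.Formatter.converter = gmtime  # timestamps rendered in UTC
LOG_FMT = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_DATE_FMT = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(stream=sys.stdout, format=LOG_FMT, datefmt=LOG_DATE_FMT)

logger = logging.getLogger('lang_main.analysis.timeline')
logger.setLevel('INFO')  # per-module level, analogous to loggers.py (which uses DEBUG for this logger)
logger.info('Removing non-relevant ObjectIDs from dataset')
# prints e.g.: sketch:INFO | 2024-05-22 16:11:46 +0000 | Removing non-relevant ObjectIDs from dataset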

View File

@ -1,7 +1,6 @@
import typing
from typing import Any, Self, Literal, overload, Final
import sys
import logging
from collections.abc import Hashable
from pathlib import Path
import copy
@ -12,14 +11,12 @@ from networkx import Graph, DiGraph
import networkx as nx
from pandas import DataFrame
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import save_pickle, load_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.graphs')
def get_graph_metadata(
graph: Graph | DiGraph,

View File

@ -1,7 +1,5 @@
from typing import cast, Callable
from collections.abc import Iterable
import sys
import logging
from itertools import combinations
import re
from math import factorial
@ -19,6 +17,7 @@ import sentence_transformers.util
from tqdm import tqdm
from lang_main.types import Embedding, PandasIndex
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.shared import (
similar_index_connection_graph,
@ -27,10 +26,6 @@ from lang_main.analysis.shared import (
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.preprocess')
# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
# duplicate cleansing based on all properties
@ -436,6 +431,7 @@ def merge_similarity_dupl(
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group,:]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry

View File

@ -19,16 +19,17 @@ def similar_index_connection_graph(
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
return similar_id_graph, graph_info
# TODO check returning tuple
def similar_index_groups(
similar_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
) -> Iterator[tuple[PandasIndex, ...]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=similar_id_graph))
for id_group in ids_groups:
yield list(id_group)
yield tuple(id_group)
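A small illustration (not from the commit) of the changed return type: connected index groups now come back as tuples instead of lists. The toy graph stands in for similar_id_graph.
import networkx as nx

similar_id_graph = nx.Graph()
similar_id_graph.add_edges_from([(1, 2), (2, 3), (10, 11)])

groups = [tuple(group) for group in nx.connected_components(similar_id_graph)]
print(groups)  # e.g. [(1, 2, 3), (10, 11)] -- tuples instead of lists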

View File

@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
from collections.abc import Iterable, Iterator
import numpy as np
@ -12,16 +10,13 @@ import sentence_transformers
import sentence_transformers.util
from tqdm.auto import tqdm # TODO: check deletion
from lang_main.types import PandasIndex, ObjectID
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
from lang_main.loggers import logger_timeline as logger
from lang_main.analysis.shared import (
similar_index_connection_graph,
similar_index_groups,
)
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.time_analysis')
def non_relevant_obj_ids(
data: DataFrame,
@ -42,6 +37,8 @@ def non_relevant_obj_ids(
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
)
# check for uniqueness of given feature for current ObjectID
# ignore NaN values
feats_per_obj_id = feats_per_obj_id.dropna()
unique_feats_per_obj_id = len(feats_per_obj_id.unique())
if unique_feats_per_obj_id > thresh_unique_feat_per_id:
@ -56,7 +53,7 @@ def remove_non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> DataFrame:
logger.info("Removing non-relevant ObjectIDs from dataset")
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
data=data,
@ -65,7 +62,9 @@ def remove_non_relevant_obj_ids(
feature_obj_id=feature_obj_id,
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~data[feature_obj_id].isin(ids_to_ignore)]
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
logger.info("Non-relevant ObjectIDs removed successfully")
return data
@ -80,14 +79,13 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
#relevant_activity_types = list(relevant_activity_types) # TODO: check deletion
logger.info("Filtering activities per ObjectID")
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series,
data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
# index of series contains ObjectIDs
@ -97,6 +95,7 @@ def filter_activities_per_obj_id(
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info("Activities per ObjectID filtered successfully")
return data_filter_activities, num_activities_per_obj_id
@ -109,7 +108,7 @@ def generate_model_input(
'VorgangsBeschreibung',
),
) -> DataFrame:
logger.info("Generating concatenation of model input features")
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@ -117,6 +116,7 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info("Model input generated successfully")
return data
@ -133,16 +133,17 @@ def generate_model_input(
def get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, list[PandasIndex]]]:
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
# already sorted ObjIDs (descending regarding number of activities)
obj_ids = cast(Iterable[ObjectID],
num_activities_per_obj_id.index)
for obj_id in obj_ids:
for obj_id in tqdm(obj_ids):
data_per_obj_id = cast(
DataFrame,
data.loc[data[feature_obj_id]==obj_id]
@ -220,7 +221,58 @@ def candidates_by_index(
yield idx_pair
"""
next part:
def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
"""function to build a mapping of ObjectIDs to their respective collection of
timeline candidates (as tuple), each candidate group is separated as distinct
tuple within this outer tuple
"""
Parameters
----------
candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
Iterator provided by ``get_timeline_candidates_index``
Returns
-------
dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
dictionary: ObjectID -> tuple of candidate groups
"""
candidates_by_obj_id: TimelineCandidates = {}
obj_id_target: ObjectID | None = None
collection: list[tuple[PandasIndex, ...]] = []
for obj_id, cands in candidates:
if obj_id_target is None:
collection = []
obj_id_target = obj_id
elif obj_id_target != obj_id:
candidates_by_obj_id[obj_id_target] = tuple(collection)
collection = []
obj_id_target = obj_id
collection.append(cands)
if collection and obj_id_target is not None:
candidates_by_obj_id[obj_id_target] = tuple(collection)
return candidates_by_obj_id
def map_obj_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
) -> dict[ObjectID, str]:
obj_id_to_text: dict[ObjectID, str] = {}
for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text
return obj_id_to_text
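Usage sketch (not part of the commit; assumes the module lives at lang_main.analysis.timeline, as the logger name suggests): feeding a small hand-built candidates iterator through transform_timeline_candidates to show the resulting mapping shape.
from lang_main.analysis.timeline import transform_timeline_candidates

# (ObjectID, candidate index group) pairs, as yielded by get_timeline_candidates_index
candidates = iter([
    (7, (101, 205)),        # first candidate group for ObjectID 7
    (7, (110, 111, 312)),   # second candidate group for ObjectID 7
    (9, (400, 401)),        # single group for ObjectID 9
])

mapping = transform_timeline_candidates(candidates)
print(mapping)
# {7: ((101, 205), (110, 111, 312)), 9: ((400, 401),)}
This mapping, together with the output of map_obj_texts, is what the new Dash demo loads from map_candidates.pkl and map_texts.pkl.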

View File

@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
import re
from itertools import combinations
from collections.abc import Iterator
@ -12,6 +10,7 @@ from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
update_graph,
TokenGraph,
@ -19,9 +18,9 @@ from lang_main.analysis.graphs import (
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.token_analysis')
#LOGGING_LEVEL = 'INFO'
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
#logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])

24
src/lang_main/loggers.py Normal file
View File

@ -0,0 +1,24 @@
from typing import Final
import logging
from lang_main.types import LoggingLevels
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing')
logger_graphs = logging.getLogger('lang_main.analysis.graphs')
logger_graphs.setLevel(LOGGING_LEVEL_GRAPHS)
logger_timeline = logging.getLogger('lang_main.analysis.timeline')
logger_timeline.setLevel(LOGGING_LEVEL_TIMELINE)
logger_token_analysis = logging.getLogger('lang_main.analysis.tokens')
logger_token_analysis.setLevel(LOGGING_LEVEL_TOKEN_ANALYSIS)
logger_preprocess.setLevel(LOGGING_LEVEL_PREPROCESS)
logger_pipelines = logging.getLogger('lang_main.pipelines')
logger_pipelines.setLevel(LOGGING_LEVEL_PIPELINES)
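Not part of the commit: because all loggers are named centrally here, a consumer (e.g. a notebook) can adjust one area at runtime without touching lang_main itself; the logger names below are taken from this file.
import logging

logging.getLogger('lang_main.analysis.timeline').setLevel(logging.WARNING)  # silence INFO/DEBUG timeline logs
logging.getLogger('lang_main.analysis.graphs').setLevel(logging.DEBUG)      # verbose graph logs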

View File

@ -5,14 +5,9 @@ import logging
from collections.abc import Callable
from pathlib import Path
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.pipelines')
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
@ -94,8 +89,9 @@ class BasePipeline():
self,
filename: str,
) -> None:
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename + '.pickle'
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
target_path = self.working_dir.joinpath(target_filename)
target_path = target_path.with_suffix('.pkl')
# saving file locally
save_pickle(obj=self._intermediate_result, path=target_path)
@ -104,7 +100,7 @@ class BasePipeline():
saving_path: str,
filename: str,
) -> tuple[Any, ...]:
target_path = saving_path + filename + '.pickle'
target_path = Path(saving_path + filename).with_suffix('.pkl')
# loading DataFrame or Series from pickle
data = load_pickle(target_path)
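One thing worth noting about the switch to with_suffix('.pkl') (illustration only, filenames are made up): with_suffix replaces an existing suffix rather than appending, so a filename argument that already contains a dot loses its tail.
from pathlib import Path

print(Path('Pipe-Target_Step-3_tokens').with_suffix('.pkl'))     # Pipe-Target_Step-3_tokens.pkl
print(Path('Pipe-Target_Step-3_tokens.v2').with_suffix('.pkl'))  # Pipe-Target_Step-3_tokens.pkl  ('.v2' is replaced)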

View File

@ -22,15 +22,6 @@ from lang_main.analysis.preprocessing import (
)
from lang_main.analysis.tokens import build_token_graph
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['export_filenames']['filename_cossim_filter_candidates']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
"""
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)

View File

@ -1,16 +1,11 @@
from typing import Any
import sys
import os
import shutil
import logging
import pickle
import tomllib
from pathlib import Path
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.helpers')
from lang_main.loggers import logger_shared_helpers as logger
# ** Lib
def create_saving_folder(

View File

@ -1,9 +1,19 @@
from typing import TypeAlias
from typing import TypeAlias, Literal
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]

View File

@ -0,0 +1,159 @@
from typing import cast
from dash import (
Dash,
html,
dcc,
callback,
Output,
Input,
State,
dash_table,
)
import plotly.express as px
import pandas as pd
from pandas import DataFrame
from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
data = cast(DataFrame, load_pickle('./data.pkl'))
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** graph config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
html.Div(children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder="ObjektID auswählen...",
)
]),
html.Div(children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id)+1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
# data
df = data.loc[list(cands_choice)].sort_index()
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_xaxes(
tickformat="%B\n%Y",
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified")
return fig
@callback(
[Output('table-candidates', 'data'),
Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = (df
.filter(items=table_feats, axis=1)
.sort_values(by='ErstellungsDatum', ascending=True))
cols = [{"name": i, "id": i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
if __name__ == '__main__':
app.run(debug=True)

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long