enhanced timeline analysis

This commit is contained in:
Florian Förster 2024-05-22 18:11:46 +02:00
parent df16b29191
commit 5d2c97165a
18 changed files with 2789 additions and 75 deletions

175
pdm.lock generated
View File

@ -5,7 +5,7 @@
groups = ["default", "notebooks", "trials"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
content_hash = "sha256:fc88dc465a3d04eb53b847d7b58db1e55ce8adb004489102cceea01fb52527dc"
content_hash = "sha256:7574154c6728ede3eaf76a8b1a3b5d4339fcc8f2dc8c41042401004b6583e151"
[[package]]
name = "annotated-types"
@ -182,6 +182,17 @@ files = [
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
]
[[package]]
name = "blinker"
version = "1.8.2"
requires_python = ">=3.8"
summary = "Fast, simple object-to-object and broadcast signaling"
groups = ["trials"]
files = [
{file = "blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01"},
{file = "blinker-1.8.2.tar.gz", hash = "sha256:8f77b09d3bf7c795e969e9486f39c2c5e9c39d4ee07424be2bc594ece9642d83"},
]
[[package]]
name = "blis"
version = "0.7.11"
@ -220,7 +231,7 @@ name = "certifi"
version = "2024.2.2"
requires_python = ">=3.6"
summary = "Python package for providing Mozilla's CA Bundle."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"},
{file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
@ -265,7 +276,7 @@ name = "charset-normalizer"
version = "3.3.2"
requires_python = ">=3.7.0"
summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
@ -306,7 +317,7 @@ name = "click"
version = "8.1.7"
requires_python = ">=3.7"
summary = "Composable command line interface toolkit"
groups = ["default"]
groups = ["default", "trials"]
dependencies = [
"colorama; platform_system == \"Windows\"",
]
@ -331,7 +342,7 @@ name = "colorama"
version = "0.4.6"
requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
summary = "Cross-platform colored terminal text."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
marker = "platform_system == \"Windows\" or sys_platform == \"win32\""
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
@ -386,6 +397,61 @@ files = [
{file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"},
]
[[package]]
name = "dash"
version = "2.17.0"
requires_python = ">=3.8"
summary = "A Python framework for building reactive web-apps. Developed by Plotly."
groups = ["trials"]
dependencies = [
"Flask<3.1,>=1.0.4",
"Werkzeug<3.1",
"dash-core-components==2.0.0",
"dash-html-components==2.0.0",
"dash-table==5.0.0",
"importlib-metadata",
"nest-asyncio",
"plotly>=5.0.0",
"requests",
"retrying",
"setuptools",
"typing-extensions>=4.1.1",
]
files = [
{file = "dash-2.17.0-py3-none-any.whl", hash = "sha256:2421569023b2cd46ea2d4b2c14fe72c71b7436527a3102219b2265fa361e7c67"},
{file = "dash-2.17.0.tar.gz", hash = "sha256:d065cd88771e45d0485993be0d27565e08918cb7edd18e31ee1c5b41252fc2fa"},
]
[[package]]
name = "dash-core-components"
version = "2.0.0"
summary = "Core component suite for Dash"
groups = ["trials"]
files = [
{file = "dash_core_components-2.0.0-py3-none-any.whl", hash = "sha256:52b8e8cce13b18d0802ee3acbc5e888cb1248a04968f962d63d070400af2e346"},
{file = "dash_core_components-2.0.0.tar.gz", hash = "sha256:c6733874af975e552f95a1398a16c2ee7df14ce43fa60bb3718a3c6e0b63ffee"},
]
[[package]]
name = "dash-html-components"
version = "2.0.0"
summary = "Vanilla HTML components for Dash"
groups = ["trials"]
files = [
{file = "dash_html_components-2.0.0-py3-none-any.whl", hash = "sha256:b42cc903713c9706af03b3f2548bda4be7307a7cf89b7d6eae3da872717d1b63"},
{file = "dash_html_components-2.0.0.tar.gz", hash = "sha256:8703a601080f02619a6390998e0b3da4a5daabe97a1fd7a9cebc09d015f26e50"},
]
[[package]]
name = "dash-table"
version = "5.0.0"
summary = "Dash table"
groups = ["trials"]
files = [
{file = "dash_table-5.0.0-py3-none-any.whl", hash = "sha256:19036fa352bb1c11baf38068ec62d172f0515f73ca3276c79dee49b95ddc16c9"},
{file = "dash_table-5.0.0.tar.gz", hash = "sha256:18624d693d4c8ef2ddec99a6f167593437a7ea0bf153aa20f318c170c5bc7308"},
]
[[package]]
name = "debugpy"
version = "1.8.1"
@ -459,6 +525,24 @@ files = [
{file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"},
]
[[package]]
name = "flask"
version = "3.0.3"
requires_python = ">=3.8"
summary = "A simple framework for building complex web applications."
groups = ["trials"]
dependencies = [
"Jinja2>=3.1.2",
"Werkzeug>=3.0.0",
"blinker>=1.6.2",
"click>=8.1.3",
"itsdangerous>=2.1.2",
]
files = [
{file = "flask-3.0.3-py3-none-any.whl", hash = "sha256:34e815dfaa43340d1d15a5c3a02b8476004037eb4840b34910c6e21679d288f3"},
{file = "flask-3.0.3.tar.gz", hash = "sha256:ceb27b0af3823ea2737928a4d99d125a06175b8512c445cbd9a9ce200ef76842"},
]
[[package]]
name = "fqdn"
version = "1.5.1"
@ -550,12 +634,26 @@ name = "idna"
version = "3.7"
requires_python = ">=3.5"
summary = "Internationalized Domain Names in Applications (IDNA)"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
]
[[package]]
name = "importlib-metadata"
version = "7.1.0"
requires_python = ">=3.8"
summary = "Read metadata from Python packages"
groups = ["trials"]
dependencies = [
"zipp>=0.5",
]
files = [
{file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"},
{file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"},
]
[[package]]
name = "intel-openmp"
version = "2021.4.0"
@ -651,6 +749,17 @@ files = [
{file = "isoduration-20.11.0.tar.gz", hash = "sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9"},
]
[[package]]
name = "itsdangerous"
version = "2.2.0"
requires_python = ">=3.8"
summary = "Safely pass data to untrusted environments and back."
groups = ["trials"]
files = [
{file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
{file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"},
]
[[package]]
name = "jedi"
version = "0.19.1"
@ -670,7 +779,7 @@ name = "jinja2"
version = "3.1.4"
requires_python = ">=3.7"
summary = "A very fast and expressive template engine."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
dependencies = [
"MarkupSafe>=2.0",
]
@ -1038,7 +1147,7 @@ name = "markupsafe"
version = "2.1.5"
requires_python = ">=3.7"
summary = "Safely add untrusted strings to HTML/XML markup."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
@ -1203,7 +1312,7 @@ name = "nest-asyncio"
version = "1.6.0"
requires_python = ">=3.5"
summary = "Patch asyncio to allow nested event loops"
groups = ["notebooks"]
groups = ["notebooks", "trials"]
files = [
{file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
{file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
@ -1974,7 +2083,7 @@ name = "requests"
version = "2.31.0"
requires_python = ">=3.7"
summary = "Python HTTP for Humans."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
dependencies = [
"certifi>=2017.4.17",
"charset-normalizer<4,>=2",
@ -1986,6 +2095,19 @@ files = [
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
]
[[package]]
name = "retrying"
version = "1.3.4"
summary = "Retrying"
groups = ["trials"]
dependencies = [
"six>=1.7.0",
]
files = [
{file = "retrying-1.3.4-py3-none-any.whl", hash = "sha256:8cc4d43cb8e1125e0ff3344e9de678fefd85db3b750b81b2240dc0183af37b35"},
{file = "retrying-1.3.4.tar.gz", hash = "sha256:345da8c5765bd982b1d1915deb9102fd3d1f7ad16bd84a9700b85f64d24e8f3e"},
]
[[package]]
name = "rfc3339-validator"
version = "0.1.4"
@ -2229,7 +2351,7 @@ name = "setuptools"
version = "69.5.1"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
groups = ["default", "trials"]
files = [
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
@ -2240,7 +2362,7 @@ name = "six"
version = "1.16.0"
requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
summary = "Python 2 and 3 compatibility utilities"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@ -2806,7 +2928,7 @@ name = "typing-extensions"
version = "4.11.0"
requires_python = ">=3.8"
summary = "Backported and Experimental Type Hints for Python 3.8+"
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
@ -2839,7 +2961,7 @@ name = "urllib3"
version = "2.2.1"
requires_python = ">=3.8"
summary = "HTTP library with thread-safe connection pooling, file post, and more."
groups = ["default", "notebooks"]
groups = ["default", "notebooks", "trials"]
files = [
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
@ -2923,6 +3045,20 @@ files = [
{file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"},
]
[[package]]
name = "werkzeug"
version = "3.0.3"
requires_python = ">=3.8"
summary = "The comprehensive WSGI web application library."
groups = ["trials"]
dependencies = [
"MarkupSafe>=2.1.1",
]
files = [
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
]
[[package]]
name = "widgetsnbextension"
version = "4.0.10"
@ -2933,3 +3069,14 @@ files = [
{file = "widgetsnbextension-4.0.10-py3-none-any.whl", hash = "sha256:d37c3724ec32d8c48400a435ecfa7d3e259995201fbefa37163124a9fcb393cc"},
{file = "widgetsnbextension-4.0.10.tar.gz", hash = "sha256:64196c5ff3b9a9183a8e699a4227fb0b7002f252c814098e66c4d1cd0644688f"},
]
[[package]]
name = "zipp"
version = "3.18.2"
requires_python = ">=3.8"
summary = "Backport of pathlib-compatible object wrapper for zip files"
groups = ["trials"]
files = [
{file = "zipp-3.18.2-py3-none-any.whl", hash = "sha256:dce197b859eb796242b0622af1b8beb0a722d52aa2f57133ead08edd5bf5374e"},
{file = "zipp-3.18.2.tar.gz", hash = "sha256:6278d9ddbcfb1f1089a88fde84481528b07b0e10474e09dcfe53dad4069fa059"},
]

View File

@ -32,4 +32,5 @@ notebooks = [
]
trials = [
"plotly>=5.22.0",
"dash>=2.17.0",
]
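Not part of the commit: a quick, hedged sanity check that the optional "trials" group resolved correctly after installing it (e.g. via `pdm install -G trials`); the version attributes are standard for both packages.
# illustrative check, assumes the trials group was installed via PDM
import dash
import plotly
print(dash.__version__, plotly.__version__)  # expect 2.17.x and 5.22+ per pdm.lock / pyproject.toml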

View File

@ -1,5 +1,8 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
from pathlib import Path
from lang_main.shared import (
@ -11,7 +14,6 @@ from lang_main.shared import (
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
__all__ = [
'save_pickle',
'load_pickle',
@ -21,6 +23,15 @@ __all__ = [
'TokenGraph',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
USE_INTERNAL_CONFIG: Final[bool] = True
# load config data: internal/external
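For orientation (not part of the commit): a minimal, self-contained sketch of how the root configuration above interacts with the named loggers set up in loggers.py. The logger name is taken from loggers.py; the message is one of the new log lines from this commit, and the printed module name depends on where the sketch is run.
import logging
import sys
from time import gmtime

logging.Formatter.converter = gmtime  # timestamps rendered in UTC
LOG_FMT = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_DATE_FMT = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(stream=sys.stdout, format=LOG_FMT, datefmt=LOG_DATE_FMT)

logger = logging.getLogger('lang_main.analysis.timeline')
logger.setLevel('INFO')  # per-module level, analogous to loggers.py (which uses DEBUG for this logger)
logger.info('Removing non-relevant ObjectIDs from dataset')
# prints e.g.: sketch:INFO | 2024-05-22 16:11:46 +0000 | Removing non-relevant ObjectIDs from dataset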

View File

@ -1,7 +1,6 @@
import typing
from typing import Any, Self, Literal, overload, Final
import sys
import logging
from collections.abc import Hashable
from pathlib import Path
import copy
@ -12,14 +11,12 @@ from networkx import Graph, DiGraph
import networkx as nx
from pandas import DataFrame
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import save_pickle, load_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.graphs')
def get_graph_metadata(
graph: Graph | DiGraph,

View File

@ -1,7 +1,5 @@
from typing import cast, Callable
from collections.abc import Iterable
import sys
import logging
from itertools import combinations
import re
from math import factorial
@ -19,6 +17,7 @@ import sentence_transformers.util
from tqdm import tqdm
from lang_main.types import Embedding, PandasIndex
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.shared import (
similar_index_connection_graph,
@ -27,10 +26,6 @@ from lang_main.analysis.shared import (
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.preprocess')
# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
# duplicate cleansing based on all properties
@ -436,6 +431,7 @@ def merge_similarity_dupl(
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group,:]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry

View File

@ -19,16 +19,17 @@ def similar_index_connection_graph(
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
return similar_id_graph, graph_info
# TODO check returning tuple
def similar_index_groups(
similar_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
) -> Iterator[tuple[PandasIndex, ...]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=similar_id_graph))
for id_group in ids_groups:
yield list(id_group)
yield tuple(id_group)
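A small illustration (not from the commit) of the changed return type: connected index groups now come back as tuples instead of lists. The toy graph stands in for similar_id_graph.
import networkx as nx

similar_id_graph = nx.Graph()
similar_id_graph.add_edges_from([(1, 2), (2, 3), (10, 11)])

groups = [tuple(group) for group in nx.connected_components(similar_id_graph)]
print(groups)  # e.g. [(1, 2, 3), (10, 11)] -- tuples instead of lists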

View File

@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
from collections.abc import Iterable, Iterator
import numpy as np
@ -12,16 +10,13 @@ import sentence_transformers
import sentence_transformers.util
from tqdm.auto import tqdm # TODO: check deletion
from lang_main.types import PandasIndex, ObjectID
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
from lang_main.loggers import logger_timeline as logger
from lang_main.analysis.shared import (
similar_index_connection_graph,
similar_index_groups,
)
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.time_analysis')
def non_relevant_obj_ids(
data: DataFrame,
@ -42,6 +37,8 @@ def non_relevant_obj_ids(
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
)
# check for uniqueness of given feature for current ObjectID
# ignore NaN values
feats_per_obj_id = feats_per_obj_id.dropna()
unique_feats_per_obj_id = len(feats_per_obj_id.unique())
if unique_feats_per_obj_id > thresh_unique_feat_per_id:
@ -56,7 +53,7 @@ def remove_non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> DataFrame:
logger.info("Removing non-relevant ObjectIDs from dataset")
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
data=data,
@ -65,7 +62,9 @@ def remove_non_relevant_obj_ids(
feature_obj_id=feature_obj_id,
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~data[feature_obj_id].isin(ids_to_ignore)]
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
logger.info("Non-relevant ObjectIDs removed successfully")
return data
@ -80,14 +79,13 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
#relevant_activity_types = list(relevant_activity_types) # TODO: check deletion
logger.info("Filtering activities per ObjectID")
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series,
data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
# index of series contains ObjectIDs
@ -97,6 +95,7 @@ def filter_activities_per_obj_id(
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info("Activities per ObjectID filtered successfully")
return data_filter_activities, num_activities_per_obj_id
@ -109,7 +108,7 @@ def generate_model_input(
'VorgangsBeschreibung',
),
) -> DataFrame:
logger.info("Generating concatenation of model input features")
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@ -117,6 +116,7 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info("Model input generated successfully")
return data
@ -133,16 +133,17 @@ def generate_model_input(
def get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, list[PandasIndex]]]:
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
# already sorted ObjIDs (descending regarding number of activities)
obj_ids = cast(Iterable[ObjectID],
num_activities_per_obj_id.index)
for obj_id in obj_ids:
for obj_id in tqdm(obj_ids):
data_per_obj_id = cast(
DataFrame,
data.loc[data[feature_obj_id]==obj_id]
@ -220,7 +221,58 @@ def candidates_by_index(
yield idx_pair
"""
next part:
def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
"""function to build a mapping of ObjectIDs to their respective collection of
timeline candidates (as tuple), each candidate group is separated as distinct
tuple within this outer tuple
"""
Parameters
----------
candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
Iterator provided by ``get_timeline_candidates_index``
Returns
-------
dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
dictionary: ObjectID -> tuple of candidate groups
"""
candidates_by_obj_id: TimelineCandidates = {}
obj_id_target: ObjectID | None = None
collection: list[tuple[PandasIndex, ...]] = []
for obj_id, cands in candidates:
if obj_id_target is None:
collection = []
obj_id_target = obj_id
elif obj_id_target != obj_id:
candidates_by_obj_id[obj_id_target] = tuple(collection)
collection = []
obj_id_target = obj_id
collection.append(cands)
if collection and obj_id_target is not None:
candidates_by_obj_id[obj_id_target] = tuple(collection)
return candidates_by_obj_id
def map_obj_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
) -> dict[ObjectID, str]:
obj_id_to_text: dict[ObjectID, str] = {}
for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text
return obj_id_to_text
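Usage sketch (not part of the commit; assumes the module lives at lang_main.analysis.timeline, as the logger name suggests): feeding a small hand-built candidates iterator through transform_timeline_candidates to show the resulting mapping shape.
from lang_main.analysis.timeline import transform_timeline_candidates

# (ObjectID, candidate index group) pairs, as yielded by get_timeline_candidates_index
candidates = iter([
    (7, (101, 205)),        # first candidate group for ObjectID 7
    (7, (110, 111, 312)),   # second candidate group for ObjectID 7
    (9, (400, 401)),        # single group for ObjectID 9
])

mapping = transform_timeline_candidates(candidates)
print(mapping)
# {7: ((101, 205), (110, 111, 312)), 9: ((400, 401),)}
This mapping, together with the output of map_obj_texts, is what the new Dash demo loads from map_candidates.pkl and map_texts.pkl.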

View File

@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
import re
from itertools import combinations
from collections.abc import Iterator
@ -12,6 +10,7 @@ from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
update_graph,
TokenGraph,
@ -19,9 +18,9 @@ from lang_main.analysis.graphs import (
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.token_analysis')
#LOGGING_LEVEL = 'INFO'
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
#logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])

24
src/lang_main/loggers.py Normal file
View File

@ -0,0 +1,24 @@
from typing import Final
import logging
from lang_main.types import LoggingLevels
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing')
logger_graphs = logging.getLogger('lang_main.analysis.graphs')
logger_graphs.setLevel(LOGGING_LEVEL_GRAPHS)
logger_timeline = logging.getLogger('lang_main.analysis.timeline')
logger_timeline.setLevel(LOGGING_LEVEL_TIMELINE)
logger_token_analysis = logging.getLogger('lang_main.analysis.tokens')
logger_token_analysis.setLevel(LOGGING_LEVEL_TOKEN_ANALYSIS)
logger_preprocess.setLevel(LOGGING_LEVEL_PREPROCESS)
logger_pipelines = logging.getLogger('lang_main.pipelines')
logger_pipelines.setLevel(LOGGING_LEVEL_PIPELINES)
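Not part of the commit: because all loggers are named centrally here, a consumer (e.g. a notebook) can adjust one area at runtime without touching lang_main itself; the logger names below are taken from this file.
import logging

logging.getLogger('lang_main.analysis.timeline').setLevel(logging.WARNING)  # silence INFO/DEBUG timeline logs
logging.getLogger('lang_main.analysis.graphs').setLevel(logging.DEBUG)      # verbose graph logs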

View File

@ -5,14 +5,9 @@ import logging
from collections.abc import Callable
from pathlib import Path
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.pipelines')
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
@ -94,8 +89,9 @@ class BasePipeline():
self,
filename: str,
) -> None:
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename + '.pickle'
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
target_path = self.working_dir.joinpath(target_filename)
target_path = target_path.with_suffix('.pkl')
# saving file locally
save_pickle(obj=self._intermediate_result, path=target_path)
@ -104,7 +100,7 @@ class BasePipeline():
saving_path: str,
filename: str,
) -> tuple[Any, ...]:
target_path = saving_path + filename + '.pickle'
target_path = Path(saving_path + filename).with_suffix('.pkl')
# loading DataFrame or Series from pickle
data = load_pickle(target_path)
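One thing worth noting about the switch to with_suffix('.pkl') (illustration only, filenames are made up): with_suffix replaces an existing suffix rather than appending, so a filename argument that already contains a dot loses its tail.
from pathlib import Path

print(Path('Pipe-Target_Step-3_tokens').with_suffix('.pkl'))     # Pipe-Target_Step-3_tokens.pkl
print(Path('Pipe-Target_Step-3_tokens.v2').with_suffix('.pkl'))  # Pipe-Target_Step-3_tokens.pkl  ('.v2' is replaced)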

View File

@ -22,15 +22,6 @@ from lang_main.analysis.preprocessing import (
)
from lang_main.analysis.tokens import build_token_graph
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['export_filenames']['filename_cossim_filter_candidates']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
"""
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)

View File

@ -1,16 +1,11 @@
from typing import Any
import sys
import os
import shutil
import logging
import pickle
import tomllib
from pathlib import Path
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.helpers')
from lang_main.loggers import logger_shared_helpers as logger
# ** Lib
def create_saving_folder(

View File

@ -1,9 +1,19 @@
from typing import TypeAlias
from typing import TypeAlias, Literal
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]

View File

@ -0,0 +1,159 @@
from typing import cast
from dash import (
Dash,
html,
dcc,
callback,
Output,
Input,
State,
dash_table,
)
import plotly.express as px
import pandas as pd
from pandas import DataFrame
from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
data = cast(DataFrame, load_pickle('./data.pkl'))
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
]
table_feats_dates = [
'ErstellungsDatum',
'ErledigungsDatum',
]
# ** graph config
markers = {
'size': 12,
'color': 'yellow',
'line': {
'width': 2,
'color': 'red',
},
}
hover_data = {
'ErstellungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
}
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
html.Div(children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder="ObjektID auswählen...",
)
]),
html.Div(children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_obj_text(obj_id):
obj_id = int(obj_id)
obj_text = texts[obj_id]
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id)+1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_timeline(index, obj_id):
obj_id = int(obj_id)
# title
obj_text = texts[obj_id]
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
# data
df = data.loc[list(cands_choice)].sort_index()
# figure
fig = px.line(
data_frame=df,
x='ErstellungsDatum',
y='ObjektID',
title=title,
hover_data=hover_data,
)
fig.update_traces(
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_xaxes(
tickformat="%B\n%Y",
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified")
return fig
@callback(
[Output('table-candidates', 'data'),
Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = (df
.filter(items=table_feats, axis=1)
.sort_values(by='ErstellungsDatum', ascending=True))
cols = [{"name": i, "id": i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
if __name__ == '__main__':
app.run(debug=True)

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long