Enhanced timeline analysis

parent df16b29191
commit 5d2c97165a

175  pdm.lock (generated)
@ -5,7 +5,7 @@
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
strategy = ["cross_platform", "inherit_metadata"]
|
||||
lock_version = "4.4.1"
|
||||
content_hash = "sha256:fc88dc465a3d04eb53b847d7b58db1e55ce8adb004489102cceea01fb52527dc"
|
||||
content_hash = "sha256:7574154c6728ede3eaf76a8b1a3b5d4339fcc8f2dc8c41042401004b6583e151"
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
@ -182,6 +182,17 @@ files = [
|
||||
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "blinker"
|
||||
version = "1.8.2"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Fast, simple object-to-object and broadcast signaling"
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01"},
|
||||
{file = "blinker-1.8.2.tar.gz", hash = "sha256:8f77b09d3bf7c795e969e9486f39c2c5e9c39d4ee07424be2bc594ece9642d83"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "blis"
|
||||
version = "0.7.11"
|
||||
@ -220,7 +231,7 @@ name = "certifi"
|
||||
version = "2024.2.2"
|
||||
requires_python = ">=3.6"
|
||||
summary = "Python package for providing Mozilla's CA Bundle."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"},
|
||||
{file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
|
||||
@ -265,7 +276,7 @@ name = "charset-normalizer"
|
||||
version = "3.3.2"
|
||||
requires_python = ">=3.7.0"
|
||||
summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
|
||||
@ -306,7 +317,7 @@ name = "click"
|
||||
version = "8.1.7"
|
||||
requires_python = ">=3.7"
|
||||
summary = "Composable command line interface toolkit"
|
||||
groups = ["default"]
|
||||
groups = ["default", "trials"]
|
||||
dependencies = [
|
||||
"colorama; platform_system == \"Windows\"",
|
||||
]
|
||||
@ -331,7 +342,7 @@ name = "colorama"
|
||||
version = "0.4.6"
|
||||
requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
summary = "Cross-platform colored terminal text."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
marker = "platform_system == \"Windows\" or sys_platform == \"win32\""
|
||||
files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
@ -386,6 +397,61 @@ files = [
|
||||
{file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dash"
|
||||
version = "2.17.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "A Python framework for building reactive web-apps. Developed by Plotly."
|
||||
groups = ["trials"]
|
||||
dependencies = [
|
||||
"Flask<3.1,>=1.0.4",
|
||||
"Werkzeug<3.1",
|
||||
"dash-core-components==2.0.0",
|
||||
"dash-html-components==2.0.0",
|
||||
"dash-table==5.0.0",
|
||||
"importlib-metadata",
|
||||
"nest-asyncio",
|
||||
"plotly>=5.0.0",
|
||||
"requests",
|
||||
"retrying",
|
||||
"setuptools",
|
||||
"typing-extensions>=4.1.1",
|
||||
]
|
||||
files = [
|
||||
{file = "dash-2.17.0-py3-none-any.whl", hash = "sha256:2421569023b2cd46ea2d4b2c14fe72c71b7436527a3102219b2265fa361e7c67"},
|
||||
{file = "dash-2.17.0.tar.gz", hash = "sha256:d065cd88771e45d0485993be0d27565e08918cb7edd18e31ee1c5b41252fc2fa"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dash-core-components"
|
||||
version = "2.0.0"
|
||||
summary = "Core component suite for Dash"
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "dash_core_components-2.0.0-py3-none-any.whl", hash = "sha256:52b8e8cce13b18d0802ee3acbc5e888cb1248a04968f962d63d070400af2e346"},
|
||||
{file = "dash_core_components-2.0.0.tar.gz", hash = "sha256:c6733874af975e552f95a1398a16c2ee7df14ce43fa60bb3718a3c6e0b63ffee"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dash-html-components"
|
||||
version = "2.0.0"
|
||||
summary = "Vanilla HTML components for Dash"
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "dash_html_components-2.0.0-py3-none-any.whl", hash = "sha256:b42cc903713c9706af03b3f2548bda4be7307a7cf89b7d6eae3da872717d1b63"},
|
||||
{file = "dash_html_components-2.0.0.tar.gz", hash = "sha256:8703a601080f02619a6390998e0b3da4a5daabe97a1fd7a9cebc09d015f26e50"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dash-table"
|
||||
version = "5.0.0"
|
||||
summary = "Dash table"
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "dash_table-5.0.0-py3-none-any.whl", hash = "sha256:19036fa352bb1c11baf38068ec62d172f0515f73ca3276c79dee49b95ddc16c9"},
|
||||
{file = "dash_table-5.0.0.tar.gz", hash = "sha256:18624d693d4c8ef2ddec99a6f167593437a7ea0bf153aa20f318c170c5bc7308"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "debugpy"
|
||||
version = "1.8.1"
|
||||
@ -459,6 +525,24 @@ files = [
|
||||
{file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flask"
|
||||
version = "3.0.3"
|
||||
requires_python = ">=3.8"
|
||||
summary = "A simple framework for building complex web applications."
|
||||
groups = ["trials"]
|
||||
dependencies = [
|
||||
"Jinja2>=3.1.2",
|
||||
"Werkzeug>=3.0.0",
|
||||
"blinker>=1.6.2",
|
||||
"click>=8.1.3",
|
||||
"itsdangerous>=2.1.2",
|
||||
]
|
||||
files = [
|
||||
{file = "flask-3.0.3-py3-none-any.whl", hash = "sha256:34e815dfaa43340d1d15a5c3a02b8476004037eb4840b34910c6e21679d288f3"},
|
||||
{file = "flask-3.0.3.tar.gz", hash = "sha256:ceb27b0af3823ea2737928a4d99d125a06175b8512c445cbd9a9ce200ef76842"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fqdn"
|
||||
version = "1.5.1"
|
||||
@ -550,12 +634,26 @@ name = "idna"
|
||||
version = "3.7"
|
||||
requires_python = ">=3.5"
|
||||
summary = "Internationalized Domain Names in Applications (IDNA)"
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
|
||||
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "importlib-metadata"
|
||||
version = "7.1.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Read metadata from Python packages"
|
||||
groups = ["trials"]
|
||||
dependencies = [
|
||||
"zipp>=0.5",
|
||||
]
|
||||
files = [
|
||||
{file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"},
|
||||
{file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "intel-openmp"
|
||||
version = "2021.4.0"
|
||||
@ -651,6 +749,17 @@ files = [
|
||||
{file = "isoduration-20.11.0.tar.gz", hash = "sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itsdangerous"
|
||||
version = "2.2.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Safely pass data to untrusted environments and back."
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
|
||||
{file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jedi"
|
||||
version = "0.19.1"
|
||||
@ -670,7 +779,7 @@ name = "jinja2"
|
||||
version = "3.1.4"
|
||||
requires_python = ">=3.7"
|
||||
summary = "A very fast and expressive template engine."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
dependencies = [
|
||||
"MarkupSafe>=2.0",
|
||||
]
|
||||
@ -1038,7 +1147,7 @@ name = "markupsafe"
|
||||
version = "2.1.5"
|
||||
requires_python = ">=3.7"
|
||||
summary = "Safely add untrusted strings to HTML/XML markup."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
|
||||
{file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
|
||||
@ -1203,7 +1312,7 @@ name = "nest-asyncio"
|
||||
version = "1.6.0"
|
||||
requires_python = ">=3.5"
|
||||
summary = "Patch asyncio to allow nested event loops"
|
||||
groups = ["notebooks"]
|
||||
groups = ["notebooks", "trials"]
|
||||
files = [
|
||||
{file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
|
||||
{file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
|
||||
@ -1974,7 +2083,7 @@ name = "requests"
|
||||
version = "2.31.0"
|
||||
requires_python = ">=3.7"
|
||||
summary = "Python HTTP for Humans."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
dependencies = [
|
||||
"certifi>=2017.4.17",
|
||||
"charset-normalizer<4,>=2",
|
||||
@ -1986,6 +2095,19 @@ files = [
|
||||
{file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "retrying"
|
||||
version = "1.3.4"
|
||||
summary = "Retrying"
|
||||
groups = ["trials"]
|
||||
dependencies = [
|
||||
"six>=1.7.0",
|
||||
]
|
||||
files = [
|
||||
{file = "retrying-1.3.4-py3-none-any.whl", hash = "sha256:8cc4d43cb8e1125e0ff3344e9de678fefd85db3b750b81b2240dc0183af37b35"},
|
||||
{file = "retrying-1.3.4.tar.gz", hash = "sha256:345da8c5765bd982b1d1915deb9102fd3d1f7ad16bd84a9700b85f64d24e8f3e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rfc3339-validator"
|
||||
version = "0.1.4"
|
||||
@ -2229,7 +2351,7 @@ name = "setuptools"
|
||||
version = "69.5.1"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
groups = ["default"]
|
||||
groups = ["default", "trials"]
|
||||
files = [
|
||||
{file = "setuptools-69.5.1-py3-none-any.whl", hash = "sha256:c636ac361bc47580504644275c9ad802c50415c7522212252c033bd15f301f32"},
|
||||
{file = "setuptools-69.5.1.tar.gz", hash = "sha256:6c1fccdac05a97e598fb0ae3bbed5904ccb317337a51139dcd51453611bbb987"},
|
||||
@ -2240,7 +2362,7 @@ name = "six"
|
||||
version = "1.16.0"
|
||||
requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
summary = "Python 2 and 3 compatibility utilities"
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
@ -2806,7 +2928,7 @@ name = "typing-extensions"
|
||||
version = "4.11.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Backported and Experimental Type Hints for Python 3.8+"
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
|
||||
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
|
||||
@ -2839,7 +2961,7 @@ name = "urllib3"
|
||||
version = "2.2.1"
|
||||
requires_python = ">=3.8"
|
||||
summary = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
groups = ["default", "notebooks"]
|
||||
groups = ["default", "notebooks", "trials"]
|
||||
files = [
|
||||
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
|
||||
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
|
||||
@ -2923,6 +3045,20 @@ files = [
|
||||
{file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.3"
|
||||
requires_python = ">=3.8"
|
||||
summary = "The comprehensive WSGI web application library."
|
||||
groups = ["trials"]
|
||||
dependencies = [
|
||||
"MarkupSafe>=2.1.1",
|
||||
]
|
||||
files = [
|
||||
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
|
||||
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "widgetsnbextension"
|
||||
version = "4.0.10"
|
||||
@ -2933,3 +3069,14 @@ files = [
|
||||
{file = "widgetsnbextension-4.0.10-py3-none-any.whl", hash = "sha256:d37c3724ec32d8c48400a435ecfa7d3e259995201fbefa37163124a9fcb393cc"},
|
||||
{file = "widgetsnbextension-4.0.10.tar.gz", hash = "sha256:64196c5ff3b9a9183a8e699a4227fb0b7002f252c814098e66c4d1cd0644688f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zipp"
|
||||
version = "3.18.2"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Backport of pathlib-compatible object wrapper for zip files"
|
||||
groups = ["trials"]
|
||||
files = [
|
||||
{file = "zipp-3.18.2-py3-none-any.whl", hash = "sha256:dce197b859eb796242b0622af1b8beb0a722d52aa2f57133ead08edd5bf5374e"},
|
||||
{file = "zipp-3.18.2.tar.gz", hash = "sha256:6278d9ddbcfb1f1089a88fde84481528b07b0e10474e09dcfe53dad4069fa059"},
|
||||
]
|
||||
|
||||
@ -32,4 +32,5 @@ notebooks = [
|
||||
]
|
||||
trials = [
|
||||
"plotly>=5.22.0",
|
||||
"dash>=2.17.0",
|
||||
]
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
from typing import Final, Any
|
||||
import inspect
|
||||
import sys
|
||||
import logging
|
||||
from time import gmtime
|
||||
from pathlib import Path
|
||||
|
||||
from lang_main.shared import (
|
||||
@ -11,7 +14,6 @@ from lang_main.shared import (
|
||||
from lang_main.analysis.preprocessing import Embedding, PandasIndex
|
||||
from lang_main.analysis.graphs import TokenGraph
|
||||
|
||||
|
||||
__all__ = [
|
||||
'save_pickle',
|
||||
'load_pickle',
|
||||
@ -21,6 +23,15 @@ __all__ = [
|
||||
'TokenGraph',
|
||||
]
|
||||
|
||||
logging.Formatter.converter = gmtime
|
||||
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
|
||||
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
|
||||
logging.basicConfig(
|
||||
stream=sys.stdout,
|
||||
format=LOG_FMT,
|
||||
datefmt=LOG_DATE_FMT,
|
||||
)
|
||||
|
||||
USE_INTERNAL_CONFIG: Final[bool] = True
|
||||
|
||||
# load config data: internal/external
|
||||
|
||||
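For reference, a minimal sketch of what this module-level logging setup produces; the format string, date format, and UTC converter are taken from the hunk above, while the emitted record is illustrative:

import logging
import sys
from time import gmtime

logging.Formatter.converter = gmtime  # render all timestamps in UTC
logging.basicConfig(
    stream=sys.stdout,
    format='%(module)s:%(levelname)s | %(asctime)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S +0000',
)
logging.getLogger('lang_main').warning('example message')
# prints something like:
# <calling module>:WARNING | 2024-05-27 12:00:00 +0000 | example message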
@ -1,7 +1,6 @@
|
||||
import typing
|
||||
from typing import Any, Self, Literal, overload, Final
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Hashable
|
||||
from pathlib import Path
|
||||
import copy
|
||||
@ -12,14 +11,12 @@ from networkx import Graph, DiGraph
|
||||
import networkx as nx
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
|
||||
# TODO change logging behaviour, add logging to file
|
||||
LOGGING_DEFAULT: Final[bool] = False
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.graphs')
|
||||
|
||||
def get_graph_metadata(
|
||||
graph: Graph | DiGraph,
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
from typing import cast, Callable
|
||||
from collections.abc import Iterable
|
||||
import sys
|
||||
import logging
|
||||
from itertools import combinations
|
||||
import re
|
||||
from math import factorial
|
||||
@ -19,6 +17,7 @@ import sentence_transformers.util
|
||||
from tqdm import tqdm
|
||||
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.shared import (
|
||||
similar_index_connection_graph,
|
||||
@ -27,10 +26,6 @@ from lang_main.analysis.shared import (
|
||||
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.preprocess')
|
||||
|
||||
# ** (1) dataset preparation: loading and simple preprocessing
|
||||
# the following functions are used to load a given dataset and perform simple
|
||||
# duplicate cleansing based on all properties
|
||||
@ -436,6 +431,7 @@ def merge_similarity_dupl(
|
||||
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
|
||||
|
||||
for similar_id_group in similar_index_groups(similar_id_graph):
|
||||
similar_id_group = list(similar_id_group)
|
||||
similar_data = merged_data.loc[similar_id_group,:]
|
||||
# keep the first entry by maximum number of occurrences, then by number of
# associated objects, then by length of the entry
|
||||
|
||||
@ -19,16 +19,17 @@ def similar_index_connection_graph(
        # inplace operation, parent/child do not really exist in undirected graph
        update_graph(graph=similar_id_graph, parent=idx1, child=idx2)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


# TODO check returning tuple
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]],
                      nx.connected_components(G=similar_id_graph))

    for id_group in ids_groups:
        yield list(id_group)
        yield tuple(id_group)
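A small consumption sketch for the changed generator; the graph content is illustrative, and connected components now arrive as tuples rather than lists:

import networkx as nx

g = nx.Graph()
g.add_edges_from([(1, 2), (2, 3), (10, 11)])

groups = list(similar_index_groups(g))
# e.g. [(1, 2, 3), (10, 11)]: one tuple per group of connected indices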
@ -1,6 +1,4 @@
|
||||
from typing import cast
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Iterable, Iterator
|
||||
|
||||
import numpy as np
|
||||
@ -12,16 +10,13 @@ import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from tqdm.auto import tqdm # TODO: check deletion
|
||||
|
||||
from lang_main.types import PandasIndex, ObjectID
|
||||
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.analysis.shared import (
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.time_analysis')
|
||||
|
||||
def non_relevant_obj_ids(
|
||||
data: DataFrame,
|
||||
@ -42,6 +37,8 @@ def non_relevant_obj_ids(
|
||||
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
|
||||
)
|
||||
# check for uniqueness of given feature for current ObjectID
|
||||
# ignore NaN values
|
||||
feats_per_obj_id = feats_per_obj_id.dropna()
|
||||
unique_feats_per_obj_id = len(feats_per_obj_id.unique())
|
||||
|
||||
if unique_feats_per_obj_id > thresh_unique_feat_per_id:
|
||||
@ -56,7 +53,7 @@ def remove_non_relevant_obj_ids(
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> DataFrame:
|
||||
|
||||
logger.info("Removing non-relevant ObjectIDs from dataset")
|
||||
data = data.copy()
|
||||
ids_to_ignore = non_relevant_obj_ids(
|
||||
data=data,
|
||||
@ -65,7 +62,9 @@ def remove_non_relevant_obj_ids(
|
||||
feature_obj_id=feature_obj_id,
|
||||
)
|
||||
# only retain entries with ObjectIDs not in IDs to ignore
|
||||
data = data.loc[~data[feature_obj_id].isin(ids_to_ignore)]
|
||||
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
|
||||
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
|
||||
logger.info("Non-relevant ObjectIDs removed successfully")
|
||||
|
||||
return data
|
||||
|
||||
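The rewritten line only adds parentheses around the mask; behaviour is unchanged. As a sketch, the filtering amounts to the following (column name taken from the hunk, ids_to_ignore computed just above):

mask = data['ObjektID'].isin(ids_to_ignore)
data = data.loc[~mask]  # keep only rows whose ObjectID is not flagged as non-relevant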
@ -80,14 +79,13 @@ def filter_activities_per_obj_id(
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities and count occurrences for each ObjectID
|
||||
#relevant_activity_types = list(relevant_activity_types) # TODO: check deletion
|
||||
logger.info("Filtering activities per ObjectID")
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series,
|
||||
data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
|
||||
# index of series contains ObjectIDs
|
||||
@ -97,6 +95,7 @@ def filter_activities_per_obj_id(
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info("Activities per ObjectID filtered successfully")
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
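A condensed sketch of the thresholding performed above; names mirror the hunk, and threshold_num_activities comes from the function signature:

counts = data_filter_activities['ObjektID'].value_counts(sort=True)
keep_ids = counts[counts > threshold_num_activities].index
data_filter_activities = data_filter_activities[
    data_filter_activities['ObjektID'].isin(keep_ids)
]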
@ -109,7 +108,7 @@ def generate_model_input(
|
||||
'VorgangsBeschreibung',
|
||||
),
|
||||
) -> DataFrame:
|
||||
|
||||
logger.info("Generating concatenation of model input features")
|
||||
data = data.copy()
|
||||
model_input_features = list(model_input_features)
|
||||
input_features = data[model_input_features].fillna('').astype(str)
|
||||
@ -117,6 +116,7 @@ def generate_model_input(
|
||||
lambda x: ' - '.join(x),
|
||||
axis=1,
|
||||
)
|
||||
logger.info("Model input generated successfully")
|
||||
|
||||
return data
|
||||
|
||||
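A condensed sketch of the concatenation above; 'VorgangsBeschreibung' appears in the hunk, but the full default feature tuple is not visible here, so the exact column pair is an assumption:

input_features = data[['VorgangsTypName', 'VorgangsBeschreibung']].fillna('').astype(str)
data['nlp_model_input'] = input_features.apply(lambda x: ' - '.join(x), axis=1)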
@ -133,16 +133,17 @@ def generate_model_input(
|
||||
def get_timeline_candidates_index(
|
||||
data: DataFrame,
|
||||
num_activities_per_obj_id: Series,
|
||||
*,
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
model_input_feature: str = 'nlp_model_input',
|
||||
) -> Iterator[tuple[ObjectID, list[PandasIndex]]]:
|
||||
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
|
||||
# already sorted ObjIDs (descending regarding number of activities)
|
||||
obj_ids = cast(Iterable[ObjectID],
|
||||
num_activities_per_obj_id.index)
|
||||
|
||||
for obj_id in obj_ids:
|
||||
for obj_id in tqdm(obj_ids):
|
||||
data_per_obj_id = cast(
|
||||
DataFrame,
|
||||
data.loc[data[feature_obj_id]==obj_id]
|
||||
@ -220,7 +221,58 @@ def candidates_by_index(
|
||||
yield idx_pair
|
||||
|
||||
|
||||
"""
|
||||
next part:
|
||||
def transform_timeline_candidates(
|
||||
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
|
||||
) -> TimelineCandidates:
|
||||
"""function to build a mapping of ObjectIDs to their respective collection of
|
||||
timeline candidates (as tuple), each candidate group is separated as distinct
|
||||
tuple within this outer tuple
|
||||
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
|
||||
Iterator provided by ``get_timeline_candidates_index``
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
|
||||
dictionary: ObjectID -> tuple of candidate groups
|
||||
"""
|
||||
|
||||
candidates_by_obj_id: TimelineCandidates = {}
|
||||
|
||||
obj_id_target: ObjectID | None = None
|
||||
collection: list[tuple[PandasIndex, ...]] = []
|
||||
|
||||
for obj_id, cands in candidates:
|
||||
if obj_id_target is None:
|
||||
collection = []
|
||||
obj_id_target = obj_id
|
||||
elif obj_id_target != obj_id:
|
||||
candidates_by_obj_id[obj_id_target] = tuple(collection)
|
||||
collection = []
|
||||
obj_id_target = obj_id
|
||||
collection.append(cands)
|
||||
|
||||
if collection and obj_id_target is not None:
|
||||
candidates_by_obj_id[obj_id_target] = tuple(collection)
|
||||
|
||||
return candidates_by_obj_id
|
||||
|
||||
def map_obj_texts(
|
||||
data: DataFrame,
|
||||
obj_ids: Iterable[ObjectID],
|
||||
) -> dict[ObjectID, str]:
|
||||
obj_id_to_text: dict[ObjectID, str] = {}
|
||||
|
||||
for obj_id in obj_ids:
|
||||
data_per_obj = cast(
|
||||
DataFrame,
|
||||
data.loc[data['ObjektID']==obj_id]
|
||||
)
|
||||
# just take first entry
|
||||
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
|
||||
obj_text = obj_text.strip(r' ,.:')
|
||||
obj_id_to_text[obj_id] = obj_text
|
||||
|
||||
return obj_id_to_text
|
||||
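An end-to-end usage sketch for the two helpers above; data, num_activities_per_obj_id, and the SentenceTransformer model are assumed to come from the preceding preprocessing steps, and the similarity threshold is illustrative:

from pathlib import Path

cands_iter = get_timeline_candidates_index(
    data,
    num_activities_per_obj_id,
    model=model,
    cos_sim_threshold=0.8,
)
map_candidates = transform_timeline_candidates(cands_iter)
map_texts = map_obj_texts(data, map_candidates.keys())

# the dashboard below unpickles these mappings (plus the preprocessed DataFrame as data.pkl)
save_pickle(obj=map_candidates, path=Path('./map_candidates.pkl'))
save_pickle(obj=map_texts, path=Path('./map_texts.pkl'))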
@ -1,6 +1,4 @@
|
||||
from typing import cast
|
||||
import sys
|
||||
import logging
|
||||
import re
|
||||
from itertools import combinations
|
||||
from collections.abc import Iterator
|
||||
@ -12,6 +10,7 @@ from spacy.lang.de import German as GermanSpacyModel
|
||||
from pandas import DataFrame
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
from lang_main.analysis.graphs import (
|
||||
update_graph,
|
||||
TokenGraph,
|
||||
@ -19,9 +18,9 @@ from lang_main.analysis.graphs import (
|
||||
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
#LOGGING_LEVEL = 'INFO'
|
||||
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
#logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
|
||||
# ** POS
|
||||
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
|
||||
|
||||
24  src/lang_main/loggers.py (new file)
@ -0,0 +1,24 @@
from typing import Final
import logging

from lang_main.types import LoggingLevels

LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'

logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing')
logger_graphs = logging.getLogger('lang_main.analysis.graphs')
logger_graphs.setLevel(LOGGING_LEVEL_GRAPHS)
logger_timeline = logging.getLogger('lang_main.analysis.timeline')
logger_timeline.setLevel(LOGGING_LEVEL_TIMELINE)
logger_token_analysis = logging.getLogger('lang_main.analysis.tokens')
logger_token_analysis.setLevel(LOGGING_LEVEL_TOKEN_ANALYSIS)
logger_preprocess.setLevel(LOGGING_LEVEL_PREPROCESS)
logger_pipelines = logging.getLogger('lang_main.pipelines')
logger_pipelines.setLevel(LOGGING_LEVEL_PIPELINES)
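Consumer modules pick up these pre-configured loggers by name, as the other hunks in this commit do; a minimal usage sketch, with the stream handler still coming from the basicConfig call in lang_main/__init__ shown above:

from lang_main.loggers import logger_timeline as logger

logger.debug('visible because LOGGING_LEVEL_TIMELINE is set to DEBUG above')
logger.info('per-module levels are configured centrally in lang_main.loggers')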
@ -5,14 +5,9 @@ import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from lang_main.loggers import logger_pipelines as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.pipelines')
|
||||
|
||||
|
||||
# ** pipelines to perform given actions on dataset in a customisable manner
|
||||
|
||||
class NoPerformableActionError(Exception):
|
||||
@ -94,8 +89,9 @@ class BasePipeline():
|
||||
self,
|
||||
filename: str,
|
||||
) -> None:
|
||||
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename + '.pickle'
|
||||
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
|
||||
target_path = self.working_dir.joinpath(target_filename)
|
||||
target_path = target_path.with_suffix('.pkl')
|
||||
# saving file locally
|
||||
save_pickle(obj=self._intermediate_result, path=target_path)
|
||||
|
||||
@ -104,7 +100,7 @@ class BasePipeline():
|
||||
saving_path: str,
|
||||
filename: str,
|
||||
) -> tuple[Any, ...]:
|
||||
target_path = saving_path + filename + '.pickle'
|
||||
target_path = Path(saving_path + filename).with_suffix('.pkl')
|
||||
# loading DataFrame or Series from pickle
|
||||
data = load_pickle(target_path)
|
||||
|
||||
|
||||
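Both changes replace manual '.pickle' string concatenation with pathlib's suffix handling; a small sketch of the resulting behaviour, with directory and filename chosen for illustration:

from pathlib import Path

target = Path('results/Pipe-TargetFeature_Step-3_dataset').with_suffix('.pkl')
# -> Path('results/Pipe-TargetFeature_Step-3_dataset.pkl')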
@ -22,15 +22,6 @@ from lang_main.analysis.preprocessing import (
|
||||
)
|
||||
from lang_main.analysis.tokens import build_token_graph
|
||||
|
||||
"""
|
||||
# ** config parameters
|
||||
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
|
||||
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
|
||||
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
|
||||
CONFIG['export_filenames']['filename_cossim_filter_candidates']
|
||||
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
|
||||
"""
|
||||
|
||||
# ** pipeline configuration
|
||||
# ** target feature preparation
|
||||
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
|
||||
|
||||
@ -1,16 +1,11 @@
|
||||
from typing import Any
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import pickle
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.helpers')
|
||||
from lang_main.loggers import logger_shared_helpers as logger
|
||||
|
||||
# ** Lib
|
||||
def create_saving_folder(
|
||||
|
||||
@ -1,9 +1,19 @@
from typing import TypeAlias
from typing import TypeAlias, Literal

import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor

LoggingLevels: TypeAlias = Literal[
    'DEBUG',
    'INFO',
    'WARNING',
    'ERROR',
    'CRITICAL',
]

PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor

TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
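An illustrative value matching the new TimelineCandidates alias (IDs and indices are made up): each ObjectID maps to a tuple of candidate groups, each group being a tuple of DataFrame indices.

cands: TimelineCandidates = {
    4711: ((12, 57, 89), (102, 117)),  # two candidate timelines for this object
    4712: ((5, 6),),                   # a single candidate group
}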
159  test-notebooks/dashboard/app.py (new file)
@ -0,0 +1,159 @@
|
||||
from typing import cast
|
||||
|
||||
from dash import (
|
||||
Dash,
|
||||
html,
|
||||
dcc,
|
||||
callback,
|
||||
Output,
|
||||
Input,
|
||||
State,
|
||||
dash_table,
|
||||
)
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main import load_pickle
|
||||
from lang_main.types import TimelineCandidates, ObjectID
|
||||
|
||||
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
|
||||
|
||||
# ** data
|
||||
data = cast(DataFrame, load_pickle('./data.pkl'))
|
||||
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
|
||||
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
|
||||
table_feats = [
|
||||
'ErstellungsDatum',
|
||||
'ErledigungsDatum',
|
||||
'VorgangsTypName',
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
table_feats_dates = [
|
||||
'ErstellungsDatum',
|
||||
'ErledigungsDatum',
|
||||
]
|
||||
|
||||
# ** graph config
|
||||
markers = {
|
||||
'size': 12,
|
||||
'color': 'yellow',
|
||||
'line': {
|
||||
'width': 2,
|
||||
'color': 'red',
|
||||
},
|
||||
}
|
||||
hover_data = {
|
||||
'ErstellungsDatum': '|%d.%m.%Y',
|
||||
'VorgangsBeschreibung': True,
|
||||
}
|
||||
|
||||
|
||||
app = Dash(prevent_initial_callbacks=True)
|
||||
|
||||
app.layout = [
|
||||
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
|
||||
html.Div(children=[
|
||||
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
|
||||
dcc.Dropdown(
|
||||
list(cands.keys()),
|
||||
id='dropdown-selection',
|
||||
placeholder="ObjektID auswählen...",
|
||||
)
|
||||
]),
|
||||
html.Div(children=[
|
||||
html.H3(id='object_text'),
|
||||
dcc.Dropdown(id='choice-candidates'),
|
||||
dcc.Graph(id='graph-output'),
|
||||
]),
|
||||
html.Div(children=[
|
||||
dash_table.DataTable(id='table-candidates')
|
||||
]),
|
||||
]
|
||||
|
||||
@callback(
|
||||
Output('object_text', 'children'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
)
|
||||
def update_obj_text(obj_id):
|
||||
obj_id = int(obj_id)
|
||||
obj_text = texts[obj_id]
|
||||
headline = f'HObjektText: {obj_text}'
|
||||
return headline
|
||||
|
||||
@callback(
|
||||
Output('choice-candidates', 'options'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
)
|
||||
def update_choice_candidates(obj_id):
|
||||
obj_id = int(obj_id)
|
||||
cands_obj_id = cands[obj_id]
|
||||
choices = list(range(1, len(cands_obj_id)+1))
|
||||
return choices
|
||||
|
||||
@callback(
|
||||
Output('graph-output', 'figure'),
|
||||
Input('choice-candidates', 'value'),
|
||||
State('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
)
|
||||
def update_timeline(index, obj_id):
|
||||
obj_id = int(obj_id)
|
||||
# title
|
||||
obj_text = texts[obj_id]
|
||||
title = f'HObjektText: {obj_text}'
|
||||
# cands
|
||||
cands_obj_id = cands[obj_id]
|
||||
cands_choice = cands_obj_id[int(index)-1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
# figure
|
||||
fig = px.line(
|
||||
data_frame=df,
|
||||
x='ErstellungsDatum',
|
||||
y='ObjektID',
|
||||
title=title,
|
||||
hover_data=hover_data,
|
||||
)
|
||||
fig.update_traces(
|
||||
mode='markers+lines',
|
||||
marker=markers,
|
||||
marker_symbol='diamond'
|
||||
)
|
||||
fig.update_xaxes(
|
||||
tickformat="%B\n%Y",
|
||||
rangeslider_visible=True,
|
||||
)
|
||||
fig.update_yaxes(type='category')
|
||||
fig.update_layout(hovermode="x unified")
|
||||
return fig
|
||||
|
||||
@callback(
|
||||
[Output('table-candidates', 'data'),
|
||||
Output('table-candidates', 'columns')],
|
||||
Input('choice-candidates', 'value'),
|
||||
State('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
)
|
||||
def update_table_candidates(index, obj_id):
|
||||
obj_id = int(obj_id)
|
||||
# cands
|
||||
cands_obj_id = cands[obj_id]
|
||||
cands_choice = cands_obj_id[int(index)-1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
df = (df
|
||||
.filter(items=table_feats, axis=1)
|
||||
.sort_values(by='ErstellungsDatum', ascending=True))
|
||||
cols = [{"name": i, "id": i} for i in df.columns]
|
||||
# convert dates to strings
|
||||
for col in table_feats_dates:
|
||||
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
|
||||
|
||||
table_data = df.to_dict('records')
|
||||
return table_data, cols
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True)
|
||||
BIN   test-notebooks/dashboard/data.pkl (new file, binary file not shown)
BIN   test-notebooks/dashboard/map_candidates.pkl (new file, binary file not shown)
BIN   test-notebooks/dashboard/map_texts.pkl (new file, binary file not shown)
2335  test-notebooks/timeline_analysis.ipynb (new file, diff suppressed because one or more lines are too long)