enhanced timeline analysis

This commit is contained in:
Florian Förster
2024-05-22 18:11:46 +02:00
parent df16b29191
commit 5d2c97165a
18 changed files with 2789 additions and 75 deletions

View File

@@ -1,5 +1,8 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
from pathlib import Path
from lang_main.shared import (
@@ -11,7 +14,6 @@ from lang_main.shared import (
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
__all__ = [
'save_pickle',
'load_pickle',
@@ -21,6 +23,15 @@ __all__ = [
'TokenGraph',
]
# ** logging configuration
# use ``time.gmtime`` as the time converter of all log formatters
logging.Formatter.converter = gmtime
# layout of a log record: module, level, timestamp, message
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
# the '+0000' offset is a literal suffix, it is not computed from the timestamp
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
# configure the root logger to write to stdout with the formats defined above
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
USE_INTERNAL_CONFIG: Final[bool] = True
# load config data: internal/external

View File

@@ -1,7 +1,6 @@
import typing
from typing import Any, Self, Literal, overload, Final
import sys
import logging
from collections.abc import Hashable
from pathlib import Path
import copy
@@ -12,14 +11,12 @@ from networkx import Graph, DiGraph
import networkx as nx
from pandas import DataFrame
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import save_pickle, load_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.graphs')
def get_graph_metadata(
graph: Graph | DiGraph,

View File

@@ -1,7 +1,5 @@
from typing import cast, Callable
from collections.abc import Iterable
import sys
import logging
from itertools import combinations
import re
from math import factorial
@@ -19,6 +17,7 @@ import sentence_transformers.util
from tqdm import tqdm
from lang_main.types import Embedding, PandasIndex
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.shared import (
similar_index_connection_graph,
@@ -27,10 +26,6 @@ from lang_main.analysis.shared import (
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.preprocess')
# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
# duplicate cleansing based on all properties
@@ -436,6 +431,7 @@ def merge_similarity_dupl(
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group,:]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry

View File

@@ -19,16 +19,17 @@ def similar_index_connection_graph(
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
return similar_id_graph, graph_info
# TODO check returning tuple
def similar_index_groups(
similar_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
) -> Iterator[tuple[PandasIndex, ...]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=similar_id_graph))
for id_group in ids_groups:
yield list(id_group)
yield tuple(id_group)

View File

@@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
from collections.abc import Iterable, Iterator
import numpy as np
@@ -12,16 +10,13 @@ import sentence_transformers
import sentence_transformers.util
from tqdm.auto import tqdm # TODO: check deletion
from lang_main.types import PandasIndex, ObjectID
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
from lang_main.loggers import logger_timeline as logger
from lang_main.analysis.shared import (
similar_index_connection_graph,
similar_index_groups,
)
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.time_analysis')
def non_relevant_obj_ids(
data: DataFrame,
@@ -42,6 +37,8 @@ def non_relevant_obj_ids(
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
)
# check for uniqueness of given feature for current ObjectID
# ignore NaN values
feats_per_obj_id = feats_per_obj_id.dropna()
unique_feats_per_obj_id = len(feats_per_obj_id.unique())
if unique_feats_per_obj_id > thresh_unique_feat_per_id:
@@ -56,7 +53,7 @@ def remove_non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> DataFrame:
logger.info("Removing non-relevant ObjectIDs from dataset")
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
data=data,
@@ -65,7 +62,9 @@ def remove_non_relevant_obj_ids(
feature_obj_id=feature_obj_id,
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~data[feature_obj_id].isin(ids_to_ignore)]
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
logger.info("Non-relevant ObjectIDs removed successfully")
return data
@@ -80,14 +79,13 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
#relevant_activity_types = list(relevant_activity_types) # TODO: check deletion
logger.info("Filtering activities per ObjectID")
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series,
data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
# index of series contains ObjectIDs
@@ -97,6 +95,7 @@ def filter_activities_per_obj_id(
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info("Activities per ObjectID filtered successfully")
return data_filter_activities, num_activities_per_obj_id
@@ -109,7 +108,7 @@ def generate_model_input(
'VorgangsBeschreibung',
),
) -> DataFrame:
logger.info("Generating concatenation of model input features")
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@@ -117,6 +116,7 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info("Model input generated successfully")
return data
@@ -133,16 +133,17 @@ def generate_model_input(
def get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, list[PandasIndex]]]:
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
# already sorted ObjIDs (descending regarding number of activities)
obj_ids = cast(Iterable[ObjectID],
num_activities_per_obj_id.index)
for obj_id in obj_ids:
for obj_id in tqdm(obj_ids):
data_per_obj_id = cast(
DataFrame,
data.loc[data[feature_obj_id]==obj_id]
@@ -220,7 +221,58 @@ def candidates_by_index(
yield idx_pair
"""
next part:
def transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """Build a mapping of ObjectIDs to their collection of timeline candidates.

    Every item of ``candidates`` pairs an ObjectID with one candidate group
    (a tuple of indices). All groups of the same ObjectID are gathered, in the
    order in which they are yielded, as distinct tuples within one outer tuple.
    Groups are gathered per ObjectID even if the items of one ObjectID are not
    yielded contiguously.

    Parameters
    ----------
    candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
        Iterator provided by ``get_timeline_candidates_index``

    Returns
    -------
    dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
        dictionary: ObjectID -> tuple of candidate groups, keys are ordered
        by the first occurrence of each ObjectID; empty if ``candidates``
        yields nothing
    """
    groups_by_obj_id: dict[ObjectID, list[tuple[PandasIndex, ...]]] = {}
    for obj_id, cands in candidates:
        # accumulate per key instead of tracking only the "current" ObjectID,
        # so a reappearing ObjectID extends its entry and does not replace it
        groups_by_obj_id.setdefault(obj_id, []).append(cands)

    # freeze the collected groups to match the ``TimelineCandidates`` layout
    return {
        obj_id: tuple(groups)
        for obj_id, groups in groups_by_obj_id.items()
    }
def map_obj_texts(
    data: DataFrame,
    obj_ids: Iterable[ObjectID],
    *,
    feature_obj_id: str = 'ObjektID',
    feature_obj_text: str = 'HObjektText',
) -> dict[ObjectID, str]:
    """Map each ObjectID to a descriptive text taken from the dataset.

    For every ObjectID the first non-NaN value of ``feature_obj_text`` among
    the rows of this ObjectID is used. Leading and trailing spaces, commas,
    dots and colons are stripped from this text.

    Parameters
    ----------
    data : DataFrame
        dataset containing the columns ``feature_obj_id`` and
        ``feature_obj_text``
    obj_ids : Iterable[ObjectID]
        ObjectIDs for which the text should be looked up
    feature_obj_id : str, optional
        column holding the ObjectIDs, by default 'ObjektID'
    feature_obj_text : str, optional
        column holding the object texts, by default 'HObjektText'

    Returns
    -------
    dict[ObjectID, str]
        dictionary: ObjectID -> stripped object text

    Raises
    ------
    IndexError
        if an ObjectID has no row with a non-NaN text
    """
    obj_id_to_text: dict[ObjectID, str] = {}

    for obj_id in obj_ids:
        data_per_obj = cast(
            DataFrame,
            data.loc[data[feature_obj_id] == obj_id]
        )
        # just take first entry which is not NaN
        obj_text = cast(str, data_per_obj[feature_obj_text].dropna().iat[0])
        obj_id_to_text[obj_id] = obj_text.strip(' ,.:')

    return obj_id_to_text

View File

@@ -1,6 +1,4 @@
from typing import cast
import sys
import logging
import re
from itertools import combinations
from collections.abc import Iterator
@@ -12,6 +10,7 @@ from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
update_graph,
TokenGraph,
@@ -19,9 +18,9 @@ from lang_main.analysis.graphs import (
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.token_analysis')
#LOGGING_LEVEL = 'INFO'
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
#logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])

24
src/lang_main/loggers.py Normal file
View File

@@ -0,0 +1,24 @@
from typing import Final
import logging
from lang_main.types import LoggingLevels
# ** logging levels, one per package area
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'


def _logger_with_level(
    name: str,
    level: LoggingLevels,
) -> logging.Logger:
    """Fetch the logger registered under ``name`` and apply ``level`` to it."""
    named_logger = logging.getLogger(name)
    named_logger.setLevel(level)
    return named_logger


# ** loggers, each one is created and configured in a single step
logger_shared_helpers = _logger_with_level(
    'lang_main.shared',
    LOGGING_LEVEL_SHARED_HELPERS,
)
logger_preprocess = _logger_with_level(
    'lang_main.analysis.preprocessing',
    LOGGING_LEVEL_PREPROCESS,
)
logger_graphs = _logger_with_level(
    'lang_main.analysis.graphs',
    LOGGING_LEVEL_GRAPHS,
)
logger_timeline = _logger_with_level(
    'lang_main.analysis.timeline',
    LOGGING_LEVEL_TIMELINE,
)
logger_token_analysis = _logger_with_level(
    'lang_main.analysis.tokens',
    LOGGING_LEVEL_TOKEN_ANALYSIS,
)
logger_pipelines = _logger_with_level(
    'lang_main.pipelines',
    LOGGING_LEVEL_PIPELINES,
)

View File

@@ -5,14 +5,9 @@ import logging
from collections.abc import Callable
from pathlib import Path
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.pipelines')
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
@@ -94,8 +89,9 @@ class BasePipeline():
self,
filename: str,
) -> None:
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename + '.pickle'
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
target_path = self.working_dir.joinpath(target_filename)
target_path = target_path.with_suffix('.pkl')
# saving file locally
save_pickle(obj=self._intermediate_result, path=target_path)
@@ -104,7 +100,7 @@ class BasePipeline():
saving_path: str,
filename: str,
) -> tuple[Any, ...]:
target_path = saving_path + filename + '.pickle'
target_path = Path(saving_path + filename).with_suffix('.pkl')
# loading DataFrame or Series from pickle
data = load_pickle(target_path)

View File

@@ -22,15 +22,6 @@ from lang_main.analysis.preprocessing import (
)
from lang_main.analysis.tokens import build_token_graph
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['export_filenames']['filename_cossim_filter_candidates']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
"""
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)

View File

@@ -1,16 +1,11 @@
from typing import Any
import sys
import os
import shutil
import logging
import pickle
import tomllib
from pathlib import Path
# ** Logging
LOGGING_LEVEL = 'INFO'
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
logger = logging.getLogger('ihm_analyse.helpers')
from lang_main.loggers import logger_shared_helpers as logger
# ** Lib
def create_saving_folder(

View File

@@ -1,9 +1,19 @@
from typing import TypeAlias
from typing import TypeAlias, Literal
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
# level names which can be assigned to the package's loggers
LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]
# index label of a pandas object: plain integer or NumPy 64-bit integer
PandasIndex: TypeAlias = int | np.int64
# identifier of an object within the dataset
ObjectID: TypeAlias = int
# embedding of a text: spaCy document or torch tensor
Embedding: TypeAlias = SpacyDoc | Tensor
Embedding: TypeAlias = SpacyDoc | Tensor
# mapping: ObjectID -> tuple of candidate groups, each group is a tuple of indices
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]