enhanced timeline analysis
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
from typing import Final, Any
|
||||
import inspect
|
||||
import sys
|
||||
import logging
|
||||
from time import gmtime
|
||||
from pathlib import Path
|
||||
|
||||
from lang_main.shared import (
|
||||
@@ -11,7 +14,6 @@ from lang_main.shared import (
|
||||
from lang_main.analysis.preprocessing import Embedding, PandasIndex
|
||||
from lang_main.analysis.graphs import TokenGraph
|
||||
|
||||
|
||||
__all__ = [
|
||||
'save_pickle',
|
||||
'load_pickle',
|
||||
@@ -21,6 +23,15 @@ __all__ = [
|
||||
'TokenGraph',
|
||||
]
|
||||
|
||||
logging.Formatter.converter = gmtime
|
||||
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
|
||||
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
|
||||
logging.basicConfig(
|
||||
stream=sys.stdout,
|
||||
format=LOG_FMT,
|
||||
datefmt=LOG_DATE_FMT,
|
||||
)
|
||||
|
||||
USE_INTERNAL_CONFIG: Final[bool] = True
|
||||
|
||||
# load config data: internal/external
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import typing
|
||||
from typing import Any, Self, Literal, overload, Final
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Hashable
|
||||
from pathlib import Path
|
||||
import copy
|
||||
@@ -12,14 +11,12 @@ from networkx import Graph, DiGraph
|
||||
import networkx as nx
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
|
||||
# TODO change logging behaviour, add logging to file
|
||||
LOGGING_DEFAULT: Final[bool] = False
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.graphs')
|
||||
|
||||
def get_graph_metadata(
|
||||
graph: Graph | DiGraph,
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from typing import cast, Callable
|
||||
from collections.abc import Iterable
|
||||
import sys
|
||||
import logging
|
||||
from itertools import combinations
|
||||
import re
|
||||
from math import factorial
|
||||
@@ -19,6 +17,7 @@ import sentence_transformers.util
|
||||
from tqdm import tqdm
|
||||
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.shared import (
|
||||
similar_index_connection_graph,
|
||||
@@ -27,10 +26,6 @@ from lang_main.analysis.shared import (
|
||||
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.preprocess')
|
||||
|
||||
# ** (1) dataset preparation: loading and simple preprocessing
|
||||
# following functions used to load a given dataset and perform simple
|
||||
# duplicate cleansing based on all properties
|
||||
@@ -436,6 +431,7 @@ def merge_similarity_dupl(
|
||||
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
|
||||
|
||||
for similar_id_group in similar_index_groups(similar_id_graph):
|
||||
similar_id_group = list(similar_id_group)
|
||||
similar_data = merged_data.loc[similar_id_group,:]
|
||||
# keep first entry with max number occurrences, then number of
|
||||
# associated objects, then length of entry
|
||||
|
||||
@@ -19,16 +19,17 @@ def similar_index_connection_graph(
|
||||
# inplace operation, parent/child do not really exist in undirected graph
|
||||
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
|
||||
|
||||
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
|
||||
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
|
||||
|
||||
return similar_id_graph, graph_info
|
||||
|
||||
# TODO check returning tuple
|
||||
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    """Yield the groups of connected indices found in a similarity graph.

    Parameters
    ----------
    similar_id_graph : Graph
        graph whose nodes are indices; every connected component is
        treated as one group of similar entries

    Yields
    ------
    tuple[PandasIndex, ...]
        one tuple per connected component; the tuple is built from a set,
        so no particular order of the contained indices should be relied on
    """
    # groups of connected indices
    ids_groups = cast(
        Iterator[set[PandasIndex]],
        nx.connected_components(G=similar_id_graph),
    )

    # each group is emitted exactly once, as an immutable tuple; callers
    # that need a list (e.g. for ``DataFrame.loc``) convert it themselves
    for id_group in ids_groups:
        yield tuple(id_group)
|
||||
@@ -1,6 +1,4 @@
|
||||
from typing import cast
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Iterable, Iterator
|
||||
|
||||
import numpy as np
|
||||
@@ -12,16 +10,13 @@ import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from tqdm.auto import tqdm # TODO: check deletion
|
||||
|
||||
from lang_main.types import PandasIndex, ObjectID
|
||||
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.analysis.shared import (
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.time_analysis')
|
||||
|
||||
def non_relevant_obj_ids(
|
||||
data: DataFrame,
|
||||
@@ -42,6 +37,8 @@ def non_relevant_obj_ids(
|
||||
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
|
||||
)
|
||||
# check for uniqueness of given feature for current ObjectID
|
||||
# ignore NaN values
|
||||
feats_per_obj_id = feats_per_obj_id.dropna()
|
||||
unique_feats_per_obj_id = len(feats_per_obj_id.unique())
|
||||
|
||||
if unique_feats_per_obj_id > thresh_unique_feat_per_id:
|
||||
@@ -56,7 +53,7 @@ def remove_non_relevant_obj_ids(
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> DataFrame:
|
||||
|
||||
logger.info("Removing non-relevant ObjectIDs from dataset")
|
||||
data = data.copy()
|
||||
ids_to_ignore = non_relevant_obj_ids(
|
||||
data=data,
|
||||
@@ -65,7 +62,9 @@ def remove_non_relevant_obj_ids(
|
||||
feature_obj_id=feature_obj_id,
|
||||
)
|
||||
# only retain entries with ObjectIDs not in IDs to ignore
|
||||
data = data.loc[~data[feature_obj_id].isin(ids_to_ignore)]
|
||||
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
|
||||
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
|
||||
logger.info("Non-relevant ObjectIDs removed successfully")
|
||||
|
||||
return data
|
||||
|
||||
@@ -80,14 +79,13 @@ def filter_activities_per_obj_id(
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# keep only relevant activities and count occurrences for each ObjectID
|
||||
#relevant_activity_types = list(relevant_activity_types) # TODO: check deletion
|
||||
logger.info("Filtering activities per ObjectID")
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series,
|
||||
data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
|
||||
# index of series contains ObjectIDs
|
||||
@@ -97,6 +95,7 @@ def filter_activities_per_obj_id(
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info("Activities per ObjectID filtered successfully")
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
@@ -109,7 +108,7 @@ def generate_model_input(
|
||||
'VorgangsBeschreibung',
|
||||
),
|
||||
) -> DataFrame:
|
||||
|
||||
logger.info("Generating concatenation of model input features")
|
||||
data = data.copy()
|
||||
model_input_features = list(model_input_features)
|
||||
input_features = data[model_input_features].fillna('').astype(str)
|
||||
@@ -117,6 +116,7 @@ def generate_model_input(
|
||||
lambda x: ' - '.join(x),
|
||||
axis=1,
|
||||
)
|
||||
logger.info("Model input generated successfully")
|
||||
|
||||
return data
|
||||
|
||||
@@ -133,16 +133,17 @@ def generate_model_input(
|
||||
def get_timeline_candidates_index(
|
||||
data: DataFrame,
|
||||
num_activities_per_obj_id: Series,
|
||||
*,
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
model_input_feature: str = 'nlp_model_input',
|
||||
) -> Iterator[tuple[ObjectID, list[PandasIndex]]]:
|
||||
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
|
||||
# already sorted ObjIDs (descending regarding number of activities)
|
||||
obj_ids = cast(Iterable[ObjectID],
|
||||
num_activities_per_obj_id.index)
|
||||
|
||||
for obj_id in obj_ids:
|
||||
for obj_id in tqdm(obj_ids):
|
||||
data_per_obj_id = cast(
|
||||
DataFrame,
|
||||
data.loc[data[feature_obj_id]==obj_id]
|
||||
@@ -220,7 +221,58 @@ def candidates_by_index(
|
||||
yield idx_pair
|
||||
|
||||
|
||||
"""
|
||||
next part:
|
||||
def transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """Build a mapping of ObjectIDs to their respective collection of
    timeline candidates (as tuple); each candidate group is kept as a
    distinct tuple within this outer tuple.

    Candidate groups are collected per ObjectID regardless of whether the
    entries of one ObjectID arrive consecutively, so a repeated ObjectID
    extends its existing collection instead of replacing it.

    Parameters
    ----------
    candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
        Iterator provided by ``get_timeline_candidates_index``

    Returns
    -------
    dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
        dictionary: ObjectID -> tuple of candidate groups, keys ordered by
        first appearance of the ObjectID in ``candidates``
    """
    # accumulate in lists first, tuples are immutable
    collected: dict[ObjectID, list[tuple[PandasIndex, ...]]] = {}

    for obj_id, cands in candidates:
        collected.setdefault(obj_id, []).append(cands)

    candidates_by_obj_id: TimelineCandidates = {
        obj_id: tuple(groups) for obj_id, groups in collected.items()
    }

    return candidates_by_obj_id
|
||||
|
||||
def map_obj_texts(
    data: DataFrame,
    obj_ids: Iterable[ObjectID],
    *,
    feature_obj_id: str = 'ObjektID',
    feature_obj_text: str = 'HObjektText',
) -> dict[ObjectID, str]:
    """Map each given ObjectID to a cleaned descriptive text.

    For every ObjectID the first non-null value of the text column among
    the rows belonging to that ObjectID is taken and stripped of leading
    and trailing spaces, commas, periods and colons.

    Parameters
    ----------
    data : DataFrame
        dataset containing the ObjectID and text columns
    obj_ids : Iterable[ObjectID]
        ObjectIDs to look up
    feature_obj_id : str, optional
        name of the column holding the ObjectIDs, by default 'ObjektID'
    feature_obj_text : str, optional
        name of the column holding the texts, by default 'HObjektText'

    Returns
    -------
    dict[ObjectID, str]
        dictionary: ObjectID -> cleaned text

    Raises
    ------
    IndexError
        raised by the positional access if an ObjectID has no non-null
        text entry in ``data``
    """
    obj_id_to_text: dict[ObjectID, str] = {}

    for obj_id in obj_ids:
        data_per_obj = cast(
            DataFrame,
            data.loc[data[feature_obj_id] == obj_id]
        )
        # just take first entry which is not null
        obj_text = cast(str, data_per_obj[feature_obj_text].dropna().iat[0])
        obj_id_to_text[obj_id] = obj_text.strip(r' ,.:')

    return obj_id_to_text
|
||||
@@ -1,6 +1,4 @@
|
||||
from typing import cast
|
||||
import sys
|
||||
import logging
|
||||
import re
|
||||
from itertools import combinations
|
||||
from collections.abc import Iterator
|
||||
@@ -12,6 +10,7 @@ from spacy.lang.de import German as GermanSpacyModel
|
||||
from pandas import DataFrame
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
from lang_main.analysis.graphs import (
|
||||
update_graph,
|
||||
TokenGraph,
|
||||
@@ -19,9 +18,9 @@ from lang_main.analysis.graphs import (
|
||||
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
#LOGGING_LEVEL = 'INFO'
|
||||
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
#logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
|
||||
# ** POS
|
||||
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
|
||||
|
||||
24
src/lang_main/loggers.py
Normal file
24
src/lang_main/loggers.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from typing import Final
|
||||
import logging
|
||||
|
||||
from lang_main.types import LoggingLevels
|
||||
|
||||
# ** logging levels, one constant per package area
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'

# ** loggers: every logger is created and directly set to its own level
# shared helpers
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)

# analysis: preprocessing
logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing')
logger_preprocess.setLevel(LOGGING_LEVEL_PREPROCESS)

# analysis: graphs
logger_graphs = logging.getLogger('lang_main.analysis.graphs')
logger_graphs.setLevel(LOGGING_LEVEL_GRAPHS)

# analysis: timeline
logger_timeline = logging.getLogger('lang_main.analysis.timeline')
logger_timeline.setLevel(LOGGING_LEVEL_TIMELINE)

# analysis: tokens
logger_token_analysis = logging.getLogger('lang_main.analysis.tokens')
logger_token_analysis.setLevel(LOGGING_LEVEL_TOKEN_ANALYSIS)

# pipelines
logger_pipelines = logging.getLogger('lang_main.pipelines')
logger_pipelines.setLevel(LOGGING_LEVEL_PIPELINES)
|
||||
@@ -5,14 +5,9 @@ import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from lang_main.loggers import logger_pipelines as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
|
||||
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.pipelines')
|
||||
|
||||
|
||||
# ** pipelines to perform given actions on dataset in a customisable manner
|
||||
|
||||
class NoPerformableActionError(Exception):
|
||||
@@ -94,8 +89,9 @@ class BasePipeline():
|
||||
self,
|
||||
filename: str,
|
||||
) -> None:
|
||||
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename + '.pickle'
|
||||
target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
|
||||
target_path = self.working_dir.joinpath(target_filename)
|
||||
target_path = target_path.with_suffix('.pkl')
|
||||
# saving file locally
|
||||
save_pickle(obj=self._intermediate_result, path=target_path)
|
||||
|
||||
@@ -104,7 +100,7 @@ class BasePipeline():
|
||||
saving_path: str,
|
||||
filename: str,
|
||||
) -> tuple[Any, ...]:
|
||||
target_path = saving_path + filename + '.pickle'
|
||||
target_path = Path(saving_path + filename).with_suffix('.pkl')
|
||||
# loading DataFrame or Series from pickle
|
||||
data = load_pickle(target_path)
|
||||
|
||||
|
||||
@@ -22,15 +22,6 @@ from lang_main.analysis.preprocessing import (
|
||||
)
|
||||
from lang_main.analysis.tokens import build_token_graph
|
||||
|
||||
"""
|
||||
# ** config parameters
|
||||
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
|
||||
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
|
||||
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
|
||||
CONFIG['export_filenames']['filename_cossim_filter_candidates']
|
||||
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
|
||||
"""
|
||||
|
||||
# ** pipeline configuration
|
||||
# ** target feature preparation
|
||||
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
|
||||
|
||||
@@ -1,16 +1,11 @@
|
||||
from typing import Any
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import pickle
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
|
||||
# ** Logging
|
||||
LOGGING_LEVEL = 'INFO'
|
||||
logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
logger = logging.getLogger('ihm_analyse.helpers')
|
||||
from lang_main.loggers import logger_shared_helpers as logger
|
||||
|
||||
# ** Lib
|
||||
def create_saving_folder(
|
||||
|
||||
@@ -1,9 +1,19 @@
|
||||
from typing import TypeAlias
|
||||
from typing import TypeAlias, Literal
|
||||
|
||||
import numpy as np
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from torch import Tensor
|
||||
|
||||
LoggingLevels: TypeAlias = Literal[
|
||||
'DEBUG',
|
||||
'INFO',
|
||||
'WARNING',
|
||||
'ERROR',
|
||||
'CRITICAL',
|
||||
]
|
||||
|
||||
PandasIndex: TypeAlias = int | np.int64
|
||||
ObjectID: TypeAlias = int
|
||||
Embedding: TypeAlias = SpacyDoc | Tensor
|
||||
Embedding: TypeAlias = SpacyDoc | Tensor
|
||||
|
||||
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
|
||||
Reference in New Issue
Block a user