from collections.abc import Collection, Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    entry_wise_cleansing,
    pattern_escape_seq_sentences,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import (
    DataFrameTLFiltered,
    ObjectID,
    PandasIndex,
    TimelineCandidates,
)


def _non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:
    """Return ObjectIDs whose number of distinct ``feature_uniqueness`` values
    exceeds ``thresh_unique_feat_per_id`` and which should therefore be ignored."""
    data = data.copy()
    ids_to_ignore: set[ObjectID] = set()
    obj_ids = cast(
        Iterable[ObjectID],  # actually a NumPy array
        data[feature_obj_id].unique(),
    )

    for obj_id in obj_ids:
        feats_per_obj_id = cast(
            Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
        )
        # check the uniqueness of the given feature for the current ObjectID,
        # ignoring NaN values
        feats_per_obj_id = feats_per_obj_id.dropna()
        unique_feats_per_obj_id = len(feats_per_obj_id.unique())

        if unique_feats_per_obj_id > thresh_unique_feat_per_id:
            ids_to_ignore.add(obj_id)

    return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    """Remove all entries whose ObjectID is classified as non-relevant by
    ``_non_relevant_obj_ids``."""
    logger.info('Removing non-relevant ObjectIDs from dataset...')
    data = data.copy()
    ids_to_ignore = _non_relevant_obj_ids(
        data=data,
        thresh_unique_feat_per_id=thresh_unique_feat_per_id,
        feature_uniqueness=feature_uniqueness,
        feature_obj_id=feature_obj_id,
    )
    # only retain entries whose ObjectID is not among the IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
    logger.info('Non-relevant ObjectIDs removed successfully.')

    return (data,)


def cleanup_descriptions(
    data: DataFrame,
    properties: Collection[str] = (
        'VorgangsBeschreibung',
        'ErledigungsBeschreibung',
    ),
) -> tuple[DataFrame]:
    """Fill missing values in the given description features and apply
    entry-wise text cleansing to them."""
    logger.info('Cleansing description features...')
    data = data.copy()
    features = list(properties)
    data[features] = data[features].fillna('N.V.')
    (data,) = entry_wise_cleansing(data, target_features=features)
    logger.info('Cleansing successful.')

    return (data.copy(),)


def calc_delta_to_repair(
    data: DataFrame,
    date_feature_start: str = 'ErstellungsDatum',
    date_feature_end: str = 'ErledigungsDatum',
    name_delta_feature: str = 'delta_to_repair',
    convert_to_days: bool = True,
) -> tuple[DataFrame]:
    """Add a feature containing the time difference between start and end of
    each operation, optionally converted to whole days."""
    logger.info('Calculating time differences between start and end of operations...')
    data = data.copy()
    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    logger.info('Calculation successful.')

    return (data,)
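
# Illustrative arithmetic for the delta computation above (hypothetical dates,
# not taken from the real dataset): ErstellungsDatum = 2021-01-01 and
# ErledigungsDatum = 2021-01-04 yield a timedelta of 3 days, i.e.
# delta_to_repair == 3 when convert_to_days is True.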


def generate_model_input(
    data: DataFrame,
    target_feature_name: str = 'nlp_model_input',
    model_input_features: Iterable[str] = (
        'VorgangsTypName',
        'VorgangsArtText',
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    """Concatenate the given input features into a single text feature that
    serves as input for the sentence embedding model."""
    logger.info('Generating concatenation of model input features...')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
    data[target_feature_name] = input_features.apply(
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully.')

    return (data,)
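
# Illustrative sketch of the generated model input (hypothetical values, not
# taken from the real dataset): a row with VorgangsTypName='Wartung',
# VorgangsArtText='Heizung' and VorgangsBeschreibung='Ventil defekt' becomes
# 'Wartung - Heizung - Ventil defekt' in the 'nlp_model_input' column.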


def filter_activities_per_obj_id(
    data: DataFrame,
    activity_feature: str = 'VorgangsTypName',
    relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
    feature_obj_id: str = 'ObjektID',
    threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
    """Keep only entries with a relevant activity type whose ObjectID occurs
    more than ``threshold_num_activities`` times; also return the per-ObjectID
    activity counts (sorted in descending order)."""
    data = data.copy()
    # filter for relevant activities and count occurrences per ObjectID
    logger.info('Filtering activities per ObjectID...')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
        Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
    )
    # keep only ObjectIDs with more than the given number of activities
    filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
    # the index of the series contains the ObjectIDs
    obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
    filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
        obj_ids_below_thresh
    )

    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully.')

    return data_filter_activities, num_activities_per_obj_id


# for each obj_id in relevant_obj_ids
## filter data for obj_id
## obtain series of (idx, nlp_model_input)
## make batch of nlp_model_input
## obtain embeddings
## calculate cosine similarity
## filter cosine similarity by threshold
## obtain idx pairs, yield
## use idx pairs to get idx values of series


def _get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
    model: SentenceTransformer,
    cos_sim_threshold: float,
    feature_obj_id: str = 'ObjektID',
    model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
    """Yield, per ObjectID, groups of pandas indices whose model inputs are
    mutually similar according to the cosine similarity of their embeddings."""
    # ObjectIDs are already sorted (descending by number of activities)
    obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

    for obj_id in tqdm(obj_ids):
        data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        data_model_input = data_per_obj_id[model_input_feature]

        candidates_idx = candidates_by_index(
            data_model_input=data_model_input,
            model=model,
            cos_sim_threshold=cos_sim_threshold,
        )
        # directly process candidates
        # candidates_idx = tuple(candidates_idx)
        similar_id_graph, _ = similar_index_connection_graph(
            similar_idx_pairs=candidates_idx,
        )

        for index_group in similar_index_groups(similar_id_graph):
            yield obj_id, index_group


# TODO: check application for duplicate removal
def _transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """Build a mapping of ObjectIDs to their respective collection of timeline
    candidates; each candidate group is kept as a distinct inner tuple within
    the outer tuple.

    Parameters
    ----------
    candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
        Iterator provided by ``_get_timeline_candidates_index``

    Returns
    -------
    dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
        dictionary: ObjectID -> tuple of candidate groups
    """

    candidates_by_obj_id: TimelineCandidates = {}

    obj_id_target: ObjectID | None = None
    collection: list[tuple[PandasIndex, ...]] = []

    for obj_id, cands in candidates:
        if obj_id_target is None:
            collection = []
            obj_id_target = obj_id
        elif obj_id_target != obj_id:
            candidates_by_obj_id[obj_id_target] = tuple(collection)
            collection = []
            obj_id_target = obj_id
        collection.append(cands)

    if collection and obj_id_target is not None:
        candidates_by_obj_id[obj_id_target] = tuple(collection)

    return candidates_by_obj_id
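
# Illustrative sketch of the transformation above (hypothetical ObjectIDs and
# indices): an iterator yielding (42, (0, 3)), (42, (5, 9)), (77, (1, 2)) is
# collected into {42: ((0, 3), (5, 9)), 77: ((1, 2),)}.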


def _map_obj_id_to_texts(
    data: DataFrame,
    feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
    """Map each ObjectID to a cleaned text descriptor taken from its first
    non-missing 'HObjektText' entry."""
    data = data.copy()
    obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())

    obj_id_to_text: dict[ObjectID, str] = {}

    for obj_id in tqdm(obj_ids):
        data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        # just take the first entry
        obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
        obj_text = obj_text.strip(r' ,.:')
        obj_id_to_text[obj_id] = obj_text

    return obj_id_to_text


def get_timeline_candidates(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
    model: SentenceTransformer,
    cos_sim_threshold: float,
    feature_obj_id: str = 'ObjektID',
    model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    """Obtain the timeline candidates per ObjectID together with a mapping of
    each ObjectID to its text descriptor."""
    logger.info('Obtaining timeline candidates...')
    candidates = _get_timeline_candidates_index(
        data=data,
        num_activities_per_obj_id=num_activities_per_obj_id,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
        feature_obj_id=feature_obj_id,
        model_input_feature=model_input_feature,
    )
    tl_candidates = _transform_timeline_candidates(candidates)
    logger.info('Timeline candidates obtained successfully.')
    # text mapping to obtain object descriptors
    logger.info('Mapping ObjectIDs to their respective text descriptor...')
    map_obj_text = _map_obj_id_to_texts(
        data=data,
        feature_obj_id=feature_obj_id,
    )
    logger.info('ObjectIDs successfully mapped to text descriptors.')

    return tl_candidates, map_obj_text


# ** Postprocessing
def filter_timeline_cands(
    data: DataFrame,
    cands: TimelineCandidates,
    obj_id: ObjectID,
    entry_idx: int,
    sort_feature: str = 'ErstellungsDatum',
) -> DataFrameTLFiltered:
    """Filter the original dataset down to one group of timeline candidates of
    the given ObjectID and sort the result chronologically by ``sort_feature``."""
    data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[entry_idx]
    data = data.loc[list(cands_choice)].sort_values(
        by=sort_feature,
        ascending=True,
    )

    return data


def calc_delta_to_next_failure(
    data: DataFrameTLFiltered,
    date_feature: str = 'ErstellungsDatum',
    name_delta_feature: str = 'delta_to_next_failure',
    convert_to_days: bool = True,
) -> DataFrameTLFiltered:
    """Add the time difference between each entry and the next one in the
    timeline; the last entry is filled with its own date and therefore gets a
    delta of zero. The result is sorted by this delta in descending order."""
    data = data.copy()
    last_val = data[date_feature].iat[-1]
    shifted = data[date_feature].shift(-1, fill_value=last_val)
    data[name_delta_feature] = shifted - data[date_feature]
    data = data.sort_values(by=name_delta_feature, ascending=False)

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    return data
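

if __name__ == '__main__':
    # Minimal usage sketch of the full timeline pipeline. The input file, the
    # embedding model name and all threshold values below are illustrative
    # assumptions, not part of the actual project configuration; the date
    # columns are assumed to already be parsed as datetimes.
    from pandas import read_parquet

    raw = read_parquet('operations.parquet')  # hypothetical input file
    model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed embedding model

    (df,) = remove_non_relevant_obj_ids(raw, thresh_unique_feat_per_id=5)
    (df,) = cleanup_descriptions(df)
    (df,) = calc_delta_to_repair(df)
    (df,) = generate_model_input(df)
    df, num_activities = filter_activities_per_obj_id(df)
    cands, obj_texts = get_timeline_candidates(
        df,
        num_activities,
        model=model,
        cos_sim_threshold=0.8,
    )

    # postprocess the first candidate group of an arbitrary ObjectID
    some_obj_id = next(iter(cands))
    timeline = filter_timeline_cands(df, cands, obj_id=some_obj_id, entry_idx=0)
    timeline = calc_delta_to_next_failure(timeline)
    logger.info('Timeline for %s: %d entries', obj_texts[some_obj_id], len(timeline))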