from collections.abc import Collection, Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# TODO: check deletion
from lang_main.analysis.shared import (
    candidates_by_index,
    entry_wise_cleansing,
    pattern_escape_seq_sentences,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import (
    DataFrameTLFiltered,
    ObjectID,
    PandasIndex,
    TimelineCandidates,
)


def _non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:
    data = data.copy()
    ids_to_ignore: set[ObjectID] = set()
    obj_ids = cast(
        Iterable[ObjectID],  # actually NumPy array
        data[feature_obj_id].unique(),
    )

    for obj_id in obj_ids:
        feats_per_obj_id = cast(
            Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
        )
        # check for uniqueness of given feature for current ObjectID,
        # ignore NaN values
        feats_per_obj_id = feats_per_obj_id.dropna()
        unique_feats_per_obj_id = len(feats_per_obj_id.unique())
        if unique_feats_per_obj_id > thresh_unique_feat_per_id:
            ids_to_ignore.add(obj_id)

    return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset...')
    data = data.copy()
    ids_to_ignore = _non_relevant_obj_ids(
        data=data,
        thresh_unique_feat_per_id=thresh_unique_feat_per_id,
        feature_uniqueness=feature_uniqueness,
        feature_obj_id=feature_obj_id,
    )
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
    logger.info('Non-relevant ObjectIDs removed successfully.')

    return (data,)


def cleanup_descriptions(
    data: DataFrame,
    properties: Collection[str] = (
        'VorgangsBeschreibung',
        'ErledigungsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Cleaning necessary descriptions...')
    data = data.copy()
    features = list(properties)
    data[features] = data[features].fillna('N.V.')
    (data,) = entry_wise_cleansing(data, target_features=features)
    logger.info('Cleansing successful.')

    return (data.copy(),)


def calc_delta_to_repair(
    data: DataFrame,
    date_feature_start: str = 'ErstellungsDatum',
    date_feature_end: str = 'ErledigungsDatum',
    name_delta_feature: str = 'delta_to_repair',
    convert_to_days: bool = True,
) -> tuple[DataFrame]:
    logger.info('Calculating time differences between start and end of operations...')
    data = data.copy()
    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days
    logger.info('Calculation successful.')

    return (data,)


def generate_model_input(
    data: DataFrame,
    target_feature_name: str = 'nlp_model_input',
    model_input_features: Iterable[str] = (
        'VorgangsTypName',
        'VorgangsArtText',
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features...')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
    data[target_feature_name] = input_features.apply(
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully.')

    return (data,)
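
# Minimal usage sketch for ``generate_model_input`` (the DataFrame and its
# values are invented for illustration; only the column names are the module
# defaults):
#
#   >>> import pandas as pd
#   >>> sample = pd.DataFrame(
#   ...     {
#   ...         'VorgangsTypName': ['Reparaturauftrag (Portal)'],
#   ...         'VorgangsArtText': ['Heizung'],
#   ...         'VorgangsBeschreibung': ['Ventil undicht'],
#   ...     }
#   ... )
#   >>> (sample,) = generate_model_input(sample)
#   >>> sample['nlp_model_input'].iat[0]
#   'Reparaturauftrag (Portal) - Heizung - Ventil undicht'
#
# Missing values are replaced with '' before joining, so a NaN description only
# leaves a trailing ' - ' separator instead of raising an error.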

def filter_activities_per_obj_id(
    data: DataFrame,
    activity_feature: str = 'VorgangsTypName',
    relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
    feature_obj_id: str = 'ObjektID',
    threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities and count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID...')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
        Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
    )
    # filter for ObjectIDs with more than given number of activities
    filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
    # index of series contains ObjectIDs
    obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
    filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
        obj_ids_below_thresh
    )
    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully.')

    return data_filter_activities, num_activities_per_obj_id


# for each obj_id in relevant_obj_ids
## filter data for obj_id
## obtain series of (idx, nlp_model_input)
## make batch of nlp_model_input
## obtain embeddings
## calculate cosine similarity
## filter cosine similarity by threshold
## obtain idx pairs, yield
## use idx pairs to get idx values of series
def _get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
    model: SentenceTransformer,
    cos_sim_threshold: float,
    feature_obj_id: str = 'ObjektID',
    model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
    # ObjectIDs already sorted in descending order by number of activities
    obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

    for obj_id in tqdm(obj_ids):
        data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        data_model_input = data_per_obj_id[model_input_feature]
        candidates_idx = candidates_by_index(
            data_model_input=data_model_input,
            model=model,
            cos_sim_threshold=cos_sim_threshold,
        )
        # directly process candidates
        # candidates_idx = tuple(candidates_idx)
        similar_id_graph, _ = similar_index_connection_graph(
            similar_idx_pairs=candidates_idx,
        )
        for index_group in similar_index_groups(similar_id_graph):
            yield obj_id, index_group


# TODO: check application for duplicate removal
def _transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """Build a mapping of ObjectIDs to their respective collection of timeline
    candidates (as a tuple); each candidate group is kept as a distinct tuple
    within this outer tuple.

    Parameters
    ----------
    candidates : Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]
        iterator provided by ``_get_timeline_candidates_index``

    Returns
    -------
    dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
        dictionary: ObjectID -> tuple of candidate groups
    """
    candidates_by_obj_id: TimelineCandidates = {}
    obj_id_target: ObjectID | None = None
    collection: list[tuple[PandasIndex, ...]] = []

    for obj_id, cands in candidates:
        if obj_id_target is None:
            collection = []
            obj_id_target = obj_id
        elif obj_id_target != obj_id:
            candidates_by_obj_id[obj_id_target] = tuple(collection)
            collection = []
            obj_id_target = obj_id
        collection.append(cands)

    if collection and obj_id_target is not None:
        candidates_by_obj_id[obj_id_target] = tuple(collection)

    return candidates_by_obj_id
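
# Illustrative shape of the mapping built above (ObjectIDs and index values are
# made up). The iterator is assumed to yield all groups of one ObjectID
# consecutively, as ``_get_timeline_candidates_index`` does. Consuming
#
#   (42, (3, 7, 11)), (42, (20, 25)), (57, (101, 105))
#
# results in
#
#   {42: ((3, 7, 11), (20, 25)), 57: ((101, 105),)}
#
# i.e. one entry per ObjectID, holding all of its candidate index groups.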

def _map_obj_id_to_texts(
    data: DataFrame,
    feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
    data = data.copy()
    obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
    obj_id_to_text: dict[ObjectID, str] = {}

    for obj_id in tqdm(obj_ids):
        data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        # just take first entry
        obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
        obj_text = obj_text.strip(r' ,.:')
        obj_id_to_text[obj_id] = obj_text

    return obj_id_to_text


def get_timeline_candidates(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
    model: SentenceTransformer,
    cos_sim_threshold: float,
    feature_obj_id: str = 'ObjektID',
    model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    logger.info('Obtaining timeline candidates...')
    candidates = _get_timeline_candidates_index(
        data=data,
        num_activities_per_obj_id=num_activities_per_obj_id,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
        feature_obj_id=feature_obj_id,
        model_input_feature=model_input_feature,
    )
    tl_candidates = _transform_timeline_candidates(candidates)
    logger.info('Timeline candidates obtained successfully.')
    # text mapping to obtain object descriptors
    logger.info('Mapping ObjectIDs to their respective text descriptor...')
    map_obj_text = _map_obj_id_to_texts(
        data=data,
        feature_obj_id=feature_obj_id,
    )
    logger.info('ObjectIDs successfully mapped to text descriptors.')

    return tl_candidates, map_obj_text


# ** Postprocessing
# filter original dataset for a batch of timeline candidates
def filter_timeline_cands(
    data: DataFrame,
    cands: TimelineCandidates,
    obj_id: ObjectID,
    entry_idx: int,
    sort_feature: str = 'ErstellungsDatum',
) -> DataFrameTLFiltered:
    data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[entry_idx]
    data = data.loc[list(cands_choice)].sort_values(
        by=sort_feature,
        ascending=True,
    )
    return data


def calc_delta_to_next_failure(
    data: DataFrameTLFiltered,
    date_feature: str = 'ErstellungsDatum',
    name_delta_feature: str = 'delta_to_next_failure',
    convert_to_days: bool = True,
) -> DataFrameTLFiltered:
    data = data.copy()
    last_val = data[date_feature].iat[-1]
    shifted = data[date_feature].shift(-1, fill_value=last_val)
    data[name_delta_feature] = shifted - data[date_feature]
    data = data.sort_values(by=name_delta_feature, ascending=False)
    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    return data
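
# The guarded block below is a minimal end-to-end usage sketch of the intended
# call order, not part of the library API: the input file, model name,
# thresholds and column contents are assumptions chosen purely for illustration.
if __name__ == '__main__':
    import pandas as pd

    raw = pd.read_parquet('operations.parquet')  # hypothetical input file
    (raw,) = remove_non_relevant_obj_ids(raw, thresh_unique_feat_per_id=3)
    (raw,) = cleanup_descriptions(raw)
    (raw,) = calc_delta_to_repair(raw)
    (raw,) = generate_model_input(raw)
    filtered, num_activities = filter_activities_per_obj_id(raw)

    # assumed model and threshold, chosen only for demonstration
    st_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    cands, obj_texts = get_timeline_candidates(
        filtered,
        num_activities,
        model=st_model,
        cos_sim_threshold=0.85,
    )

    # inspect the first candidate group of one ObjectID
    some_obj_id = next(iter(cands))
    timeline = filter_timeline_cands(filtered, cands, obj_id=some_obj_id, entry_idx=0)
    timeline = calc_delta_to_next_failure(timeline)
    print(obj_texts[some_obj_id])
    print(timeline)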