diff --git a/lang_main_config.toml b/lang_main_config.toml
index b83bb12..c3d3e6e 100644
--- a/lang_main_config.toml
+++ b/lang_main_config.toml
@@ -1,4 +1,6 @@
 # lang_main: Config file
+[info]
+pkg = 'lang_main'
 
 [paths]
 inputs = './inputs/'
@@ -40,6 +42,7 @@ threshold_edge_number = 330
 threshold_unique_texts = 4
 criterion_feature = 'HObjektText'
 feature_name_obj_id = 'ObjektID'
+feature_name_obj_text = 'HObjektText'
 
 [time_analysis.preparation]
 name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
diff --git a/src/lang_main/analysis/shared.py b/src/lang_main/analysis/shared.py
index 80579a9..a90df48 100644
--- a/src/lang_main/analysis/shared.py
+++ b/src/lang_main/analysis/shared.py
@@ -112,8 +112,6 @@ def candidates_by_index(
     )
     # cosine similarity
     cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
-    # TODO check removal
-    # cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
     np.fill_diagonal(cos_sim, 0.0)
     cos_sim = np.triu(cos_sim)
     cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
diff --git a/src/lang_main/analysis/timeline.py b/src/lang_main/analysis/timeline.py
index 9339a7d..d5f582d 100644
--- a/src/lang_main/analysis/timeline.py
+++ b/src/lang_main/analysis/timeline.py
@@ -142,7 +142,7 @@ def filter_activities_per_obj_id(
     threshold_num_activities: int = 1,
 ) -> tuple[DataFrame, Series]:
     data = data.copy()
-    # filter only relevant activities count occurrences for each ObjectID
+    # filter only relevant activities, count occurrences for each ObjectID
     logger.info('Filtering activities per ObjectID...')
     filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
     data_filter_activities = data.loc[filt_rel_activities].copy()
@@ -249,6 +249,7 @@ def _transform_timeline_candidates(
 def _map_obj_id_to_texts(
     data: DataFrame,
     feature_obj_id: str = 'ObjektID',
+    feature_obj_text: str = 'HObjektText',
 ) -> dict[ObjectID, str]:
     data = data.copy()
     obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
@@ -256,9 +257,9 @@
     obj_id_to_text: dict[ObjectID, str] = {}
 
     for obj_id in tqdm(obj_ids):
-        data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
+        data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
         # just take first entry
-        obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
+        obj_text = cast(str, data_per_obj[feature_obj_text].dropna().iat[0])
         obj_text = obj_text.strip(r' ,.:')
         obj_id_to_text[obj_id] = obj_text
 
@@ -272,6 +273,7 @@ def get_timeline_candidates(
     model: SentenceTransformer,
     cos_sim_threshold: float,
     feature_obj_id: str = 'ObjektID',
+    feature_obj_text: str = 'HObjektText',
     model_input_feature: str = 'nlp_model_input',
 ) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
     logger.info('Obtaining timeline candidates...')
@@ -290,6 +292,7 @@
     map_obj_text = _map_obj_id_to_texts(
         data=data,
         feature_obj_id=feature_obj_id,
+        feature_obj_text=feature_obj_text,
     )
     logger.info('ObjectIDs successfully mapped to text descriptors.')
 
diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py
index 7b7f50c..1f604a8 100644
--- a/src/lang_main/constants.py
+++ b/src/lang_main/constants.py
@@ -145,6 +145,9 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
     'criterion_feature'
 ]
 FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
+FEATURE_NAME_OBJ_TEXT: Final[str] = CONFIG['time_analysis']['uniqueness'][
+    'feature_name_obj_text'
+]
 # ** time_analysis.preparation
 # NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
 NAME_DELTA_FEAT_TO_REPAIR: Final[str] = CONFIG['time_analysis']['preparation']['name_delta_feat_to_repair']
diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml
index 521c6a0..c3d3e6e 100644
--- a/src/lang_main/lang_main_config.toml
+++ b/src/lang_main/lang_main_config.toml
@@ -42,6 +42,7 @@ threshold_edge_number = 330
 threshold_unique_texts = 4
 criterion_feature = 'HObjektText'
 feature_name_obj_id = 'ObjektID'
+feature_name_obj_text = 'HObjektText'
 
 [time_analysis.preparation]
 name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
diff --git a/src/lang_main/pipelines/predefined.py b/src/lang_main/pipelines/predefined.py
index 22bb04d..48337cc 100644
--- a/src/lang_main/pipelines/predefined.py
+++ b/src/lang_main/pipelines/predefined.py
@@ -29,6 +29,7 @@ from lang_main.constants import (
     CYTO_BASE_NETWORK_NAME,
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
+    FEATURE_NAME_OBJ_TEXT,
     MODEL_INPUT_FEATURES,
     NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
@@ -287,6 +288,7 @@ def build_timeline_pipe() -> Pipeline:
             'model': STFR_MODEL,
             'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
             'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'feature_obj_text': FEATURE_NAME_OBJ_TEXT,
             'model_input_feature': 'nlp_model_input',
         },
         save_result=True,
diff --git a/tests/_comparison_results/preprocess_pre_cleaned.pkl b/tests/_comparison_results/preprocess_pre_cleaned.pkl
new file mode 100644
index 0000000..5d8f8ef
Binary files /dev/null and b/tests/_comparison_results/preprocess_pre_cleaned.pkl differ
diff --git a/tests/_comparison_results/preprocess_pre_cleaned.xlsx b/tests/_comparison_results/preprocess_pre_cleaned.xlsx
new file mode 100644
index 0000000..6e011f2
Binary files /dev/null and b/tests/_comparison_results/preprocess_pre_cleaned.xlsx differ
diff --git a/tests/_comparison_results/timeline_01_act_per_objid.pkl b/tests/_comparison_results/timeline_01_act_per_objid.pkl
new file mode 100644
index 0000000..260d019
Binary files /dev/null and b/tests/_comparison_results/timeline_01_act_per_objid.pkl differ
diff --git a/tests/_comparison_results/timeline_01_act_per_objid.xlsx b/tests/_comparison_results/timeline_01_act_per_objid.xlsx
new file mode 100644
index 0000000..3ade1b6
Binary files /dev/null and b/tests/_comparison_results/timeline_01_act_per_objid.xlsx differ
diff --git a/tests/_comparison_results/timeline_01_df_filtered.pkl b/tests/_comparison_results/timeline_01_df_filtered.pkl
new file mode 100644
index 0000000..8a614fc
Binary files /dev/null and b/tests/_comparison_results/timeline_01_df_filtered.pkl differ
diff --git a/tests/_comparison_results/timeline_01_df_filtered.xlsx b/tests/_comparison_results/timeline_01_df_filtered.xlsx
new file mode 100644
index 0000000..82912b7
Binary files /dev/null and b/tests/_comparison_results/timeline_01_df_filtered.xlsx differ
diff --git a/tests/_comparison_results/timeline_01_timeline_cands_dict.pkl b/tests/_comparison_results/timeline_01_timeline_cands_dict.pkl
new file mode 100644
index 0000000..516b730
Binary files /dev/null and b/tests/_comparison_results/timeline_01_timeline_cands_dict.pkl differ
diff --git a/tests/analysis/test_preprocessing.py b/tests/analysis/test_preprocessing.py
index bc87f15..c52754d 100644
--- a/tests/analysis/test_preprocessing.py
+++ b/tests/analysis/test_preprocessing.py
@@ -3,6 +3,7 @@ executed in a pipeline
 """
 
 from pathlib import Path
+
 from lang_main import model_loader
 from lang_main.analysis import preprocessing as ppc
 from lang_main.analysis import shared
diff --git a/tests/analysis/test_timeline.py b/tests/analysis/test_timeline.py
new file mode 100644
index 0000000..374b882
--- /dev/null
+++ b/tests/analysis/test_timeline.py
@@ -0,0 +1,316 @@
+from pathlib import Path
+from typing import cast
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from lang_main import io, model_loader
+from lang_main.analysis import timeline as tl
+from lang_main.types import STFRModelTypes, TimelineCandidates
+
+
+@pytest.fixture(scope='module')
+def data_timeline_filter_activities() -> pd.DataFrame:
+    pth_data = Path('./tests/_comparison_results/timeline_01_df_filtered.pkl')
+    return pd.read_pickle(pth_data)
+
+
+@pytest.fixture(scope='module')
+def data_timeline_number_activities() -> pd.Series:
+    pth_data = Path('./tests/_comparison_results/timeline_01_act_per_objid.pkl')
+    return pd.read_pickle(pth_data)
+
+
+@pytest.fixture(scope='module')
+def STFR_model():
+    model = model_loader.load_sentence_transformer(
+        model_name=STFRModelTypes.ALL_MINI_LM_L6_V2,
+    )
+    return model
+
+
+@pytest.fixture(scope='module')
+def data_timeline_cands_dict() -> TimelineCandidates:
+    pth_data = './tests/_comparison_results/timeline_01_timeline_cands_dict.pkl'
+    tl_cands = cast(TimelineCandidates, io.load_pickle(pth_data))
+    return tl_cands
+
+
+def test_cleanup_descriptions(data_pre_cleaned):
+    data = data_pre_cleaned.copy()
+    data.at[0, 'VorgangsBeschreibung'] = np.nan
+    data.at[3, 'VorgangsBeschreibung'] = np.nan
+    data.at[5, 'ErledigungsBeschreibung'] = np.nan
+    properties = ('VorgangsBeschreibung', 'ErledigungsBeschreibung')
+    (data_proc,) = tl.cleanup_descriptions(data, properties=properties)
+    assert data_proc.at[0, 'VorgangsBeschreibung'] == 'N.V.'
+    assert data_proc.at[3, 'VorgangsBeschreibung'] == 'N.V.'
+    assert data_proc.at[5, 'ErledigungsBeschreibung'] == 'N.V.'
+
+
+@pytest.mark.parametrize('convert_to_days', [True, False])
+def test_calc_delta_to_repair(data_pre_cleaned, convert_to_days):
+    feat_start = 'ErstellungsDatum'
+    feat_end = 'ErledigungsDatum'
+    name_delta_feature = 'Test'
+    (data,) = tl.calc_delta_to_repair(
+        data_pre_cleaned,
+        date_feature_start=feat_start,
+        date_feature_end=feat_end,
+        name_delta_feature=name_delta_feature,
+        convert_to_days=convert_to_days,
+    )
+    assert name_delta_feature in data.columns
+    test_date = data_pre_cleaned.at[0, feat_end] - data_pre_cleaned.at[0, feat_start]
+    if convert_to_days:
+        assert test_date.days == data.at[0, name_delta_feature]
+    else:
+        assert test_date == data.at[0, name_delta_feature]
+
+
+def test_non_relevant_obj_ids(data_pre_cleaned):
+    feature_uniqueness = 'HObjektText'
+    feature_obj_id = 'ObjektID'
+    threshold = 1
+    data = data_pre_cleaned.copy()
+    data.at[0, feature_obj_id] = 1
+    ids_to_ignore = tl._non_relevant_obj_ids(
+        data,
+        thresh_unique_feat_per_id=threshold,
+        feature_uniqueness=feature_uniqueness,
+        feature_obj_id=feature_obj_id,
+    )
+    assert len(ids_to_ignore) == 1
+    assert ids_to_ignore == (1,)
+
+
+def test_remove_non_relevant_obj_ids(data_pre_cleaned):
+    feature_uniqueness = 'HObjektText'
+    feature_obj_id = 'ObjektID'
+    threshold = 1
+    data = data_pre_cleaned.copy()
+    data.at[0, feature_obj_id] = 1
+
+    (data_proc,) = tl.remove_non_relevant_obj_ids(
+        data,
+        thresh_unique_feat_per_id=threshold,
+        feature_uniqueness=feature_uniqueness,
+        feature_obj_id=feature_obj_id,
+    )
+    unique_obj_ids = data_proc[feature_obj_id].unique()
+    assert 1 not in unique_obj_ids
+    assert len(unique_obj_ids) == 2
+
+
+def test_generate_model_input(data_pre_cleaned):
+    target_feature_name = 'nlp_model_input'
+    model_input_features = (
+        'VorgangsTypName',
+        'VorgangsBeschreibung',
+    )
+    data = data_pre_cleaned.copy()
+    (data_proc,) = tl.generate_model_input(
+        data,
+        target_feature_name=target_feature_name,
+        model_input_features=model_input_features,
+    )
+    feat1 = data.at[0, model_input_features[0]]
+    feat2 = data.at[0, model_input_features[1]]
+    test_result = f'{feat1} - {feat2}'
+    assert data_proc.at[0, target_feature_name] == test_result
+
+
+def test_filter_activities_per_obj_id(data_pre_cleaned):
+    activity_feature = 'VorgangsTypName'
+    relevant_activity_types = ('Störungsmeldung',)
+    feature_obj_id = 'ObjektID'
+    threshold_num_activities = 1  # at least 2 occurrences per ObjID
+
+    data = data_pre_cleaned.copy()
+    data = data.iloc[:5]
+
+    df_filtered, act_per_obj_id = tl.filter_activities_per_obj_id(
+        data,
+        activity_feature=activity_feature,
+        relevant_activity_types=relevant_activity_types,
+        feature_obj_id=feature_obj_id,
+        threshold_num_activities=threshold_num_activities,
+    )
+    assert len(df_filtered) == 2
+    assert df_filtered.iat[0, 1] == act_per_obj_id.index[0]
+    assert act_per_obj_id.iat[0] == 2
+
+
+def test_get_timeline_candidates_index(
+    data_timeline_filter_activities,
+    data_timeline_number_activities,
+    STFR_model,
+):
+    data = data_timeline_filter_activities.copy()
+    target_feature_name = 'nlp_model_input'
+    model_input_features = ('VorgangsBeschreibung',)
+    (data,) = tl.generate_model_input(
+        data,
+        target_feature_name=target_feature_name,
+        model_input_features=model_input_features,
+    )
+    data_num_act = data_timeline_number_activities.copy()
+    cos_sim_threshold = 0.8
+    tl_cands_iter = tl._get_timeline_candidates_index(
+        data=data,
+        num_activities_per_obj_id=data_num_act,
+        model=STFR_model,
+        cos_sim_threshold=cos_sim_threshold,
+        feature_obj_id='ObjektID',
+        model_input_feature=target_feature_name,
+    )
+    tl_cands_idx = tuple(tl_cands_iter)  # format tuple((ObjID, (Idx1, Idx2)))
+    assert len(tl_cands_idx) == 0
+
+    cos_sim_threshold = 0.0
+    tl_cands_iter = tl._get_timeline_candidates_index(
+        data=data,
+        num_activities_per_obj_id=data_num_act,
+        model=STFR_model,
+        cos_sim_threshold=cos_sim_threshold,
+        feature_obj_id='ObjektID',
+        model_input_feature=target_feature_name,
+    )
+    tl_cands_idx = tuple(tl_cands_iter)
+    assert len(tl_cands_idx) == 1
+    assert tl_cands_idx == ((3, (np.int64(3), np.int64(4))),)
+
+
+def test_transform_timeline_candidates(
+    data_timeline_filter_activities,
+    data_timeline_number_activities,
+    STFR_model,
+):
+    data = data_timeline_filter_activities.copy()
+    target_feature_name = 'nlp_model_input'
+    model_input_features = ('VorgangsBeschreibung',)
+    (data,) = tl.generate_model_input(
+        data,
+        target_feature_name=target_feature_name,
+        model_input_features=model_input_features,
+    )
+    data_num_act = data_timeline_number_activities.copy()
+    cos_sim_threshold = 0.0
+    tl_cands_iter = tl._get_timeline_candidates_index(
+        data=data,
+        num_activities_per_obj_id=data_num_act,
+        model=STFR_model,
+        cos_sim_threshold=cos_sim_threshold,
+        feature_obj_id='ObjektID',
+        model_input_feature=target_feature_name,
+    )
+    tl_cands_dict = tl._transform_timeline_candidates(tl_cands_iter)
+
+    assert 3 in tl_cands_dict
+    assert tl_cands_dict[3] == ((np.int64(3), np.int64(4)),)
+
+
+def test_map_obj_id_to_texts(data_pre_cleaned):
+    data = data_pre_cleaned.iloc[:5].copy()
+    feature_obj_id = 'ObjektID'
+    feature_obj_text = 'HObjektText'
+    map_text = tl._map_obj_id_to_texts(
+        data=data,
+        feature_obj_id=feature_obj_id,
+        feature_obj_text=feature_obj_text,
+    )
+    assert len(map_text) == 3  # three unique IDs
+    assert map_text[1] == 'Fräsmaschine-FS435X'
+    assert map_text[2] == 'Schleifmaschine-S4x87'
+    assert map_text[3] == 'Bohrbearbeitungszentrum-BBZ35'
+
+
+def test_get_timeline_candidates(
+    data_timeline_filter_activities,
+    data_timeline_number_activities,
+    STFR_model,
+):
+    data = data_timeline_filter_activities.copy()
+    target_feature_name = 'nlp_model_input'
+    model_input_features = ('VorgangsBeschreibung',)
+    (data,) = tl.generate_model_input(
+        data,
+        target_feature_name=target_feature_name,
+        model_input_features=model_input_features,
+    )
+    data_num_act = data_timeline_number_activities.copy()
+    cos_sim_threshold = 0.0
+    tl_cands_dict, map_text = tl.get_timeline_candidates(
+        data=data,
+        num_activities_per_obj_id=data_num_act,
+        model=STFR_model,
+        cos_sim_threshold=cos_sim_threshold,
+        feature_obj_id='ObjektID',
+        feature_obj_text='HObjektText',
+        model_input_feature=target_feature_name,
+    )
+    assert 3 in tl_cands_dict
+    assert tl_cands_dict[3] == ((np.int64(3), np.int64(4)),)
+    assert len(map_text) == 1  # only one ObjektID has timeline candidates
+    assert map_text[3] == 'Bohrbearbeitungszentrum-BBZ35'
+
+
+def test_filter_timeline_cands(data_pre_cleaned, data_timeline_cands_dict):
+    obj_id = 3
+    entry_idx = 0
+    sort_feature = 'ErstellungsDatum'
+    data_proc = tl.filter_timeline_cands(
+        data_pre_cleaned,
+        data_timeline_cands_dict,
+        obj_id=obj_id,
+        entry_idx=entry_idx,
+        sort_feature=sort_feature,
+    )
+    assert 3 in data_proc.index
+    assert 4 in data_proc.index
+    assert data_proc.at[3, 'ObjektID'] == 3
+    assert data_proc.at[4, 'ObjektID'] == 3
+    assert data_proc.at[3, 'VorgangsTypName'] == 'Störungsmeldung'
+    assert data_proc.at[4, 'VorgangsTypName'] == 'Störungsmeldung'
+    assert (
+        data_proc.at[3, 'ErledigungsBeschreibung']
+        == 'Beseitigung der Blockierung und Überprüfung des Antriebs'
+    )
+    assert (
+        data_proc.at[4, 'ErledigungsBeschreibung']
+        == 'Reinigung der Leitungen und Austausch des Kühlmittels'
+    )
+
+
+@pytest.mark.parametrize('convert_to_days', [True, False])
+def test_calc_delta_to_next_failure(
+    data_pre_cleaned,
+    data_timeline_cands_dict,
+    convert_to_days,
+):
+    obj_id = 3
+    entry_idx = 0
+    sort_feature = 'ErstellungsDatum'
+    data_tl_filtered = tl.filter_timeline_cands(
+        data_pre_cleaned,
+        data_timeline_cands_dict,
+        obj_id=obj_id,
+        entry_idx=entry_idx,
+        sort_feature=sort_feature,
+    )
+    name_delta_feature = 'test_delta'
+    data_proc = tl.calc_delta_to_next_failure(
+        data_tl_filtered,
+        date_feature=sort_feature,
+        name_delta_feature=name_delta_feature,
+        convert_to_days=convert_to_days,
+    )
+    test_date = data_proc.at[4, sort_feature] - data_pre_cleaned.at[3, sort_feature]
+    test_date_last = pd.Timedelta(0)
+    if convert_to_days:
+        assert test_date.days == data_proc.at[3, name_delta_feature]
+        assert test_date_last.days == data_proc.at[4, name_delta_feature]
+    else:
+        assert test_date == data_proc.at[3, name_delta_feature]
+        assert test_date_last == data_proc.at[4, name_delta_feature]
diff --git a/tests/conftest.py b/tests/conftest.py
index c2f44e6..1d2fce3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,9 +1,10 @@
 from pathlib import Path
 
-from lang_main.analysis import graphs
 import pandas as pd
 import pytest
 
+from lang_main.analysis import graphs
+
 DATE_COLS: tuple[str, ...] = (
     'VorgangsDatum',
     'ErledigungsDatum',
@@ -25,6 +26,12 @@ def raw_data_date_cols():
     return DATE_COLS
 
 
+@pytest.fixture(scope='session')
+def data_pre_cleaned() -> pd.DataFrame:
+    pth_data = Path('./tests/_comparison_results/preprocess_pre_cleaned.pkl')
+    return pd.read_pickle(pth_data)
+
+
 @pytest.fixture(scope='session')
 def data_analyse_feature() -> pd.DataFrame:
     pth_data = Path('./tests/_comparison_results/analyse_feature.pkl')
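
Reviewer note: the pair-selection logic exercised by these timeline tests lives in candidates_by_index (see the src/lang_main/analysis/shared.py hunk above). Self-similarity is zeroed out and only the upper triangle of the cosine-similarity matrix is kept, so each unordered pair of texts is reported at most once. The following is a minimal, self-contained sketch of just that masking step; the toy vectors and the names demo_embeddings and threshold are illustrative stand-ins, not part of the codebase.

import numpy as np

# Toy stand-ins for sentence-transformer embeddings: four vectors, three dims.
demo_embeddings = np.array(
    [
        [1.0, 0.0, 0.0],
        [0.9, 0.1, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.95, 0.05],
    ]
)
# Row-normalize so the plain dot product equals the cosine similarity.
normalized = demo_embeddings / np.linalg.norm(demo_embeddings, axis=1, keepdims=True)
cos_sim = normalized @ normalized.T

threshold = 0.8  # plays the role of cos_sim_threshold / THRESHOLD_TIMELINE_SIMILARITY
np.fill_diagonal(cos_sim, 0.0)  # drop self-similarity
cos_sim = np.triu(cos_sim)  # keep each unordered pair (i, j) with i < j exactly once
pairs = np.argwhere(cos_sim >= threshold)
print(pairs)  # e.g. [[0 1], [2 3]]: index pairs of near-duplicate texts

With threshold = 0.8 only the two near-duplicate vector pairs survive, which mirrors why test_get_timeline_candidates_index finds no candidates at 0.8 but one candidate pair at 0.0 on the small fixture data.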