"""testing each function in a consecutive way like each one is executed in in a pipeline """ from pathlib import Path from lang_main import model_loader from lang_main.analysis import preprocessing as ppc from lang_main.analysis import shared from lang_main.types import LanguageModels, STFRModelTypes def test_load_data(raw_data_path, raw_data_date_cols): (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols) assert len(data) == 1000 def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols): (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols) (data,) = ppc.remove_duplicates(data) assert len(data) == 999 def test_remove_na(raw_data_path, raw_data_date_cols): (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols) (data,) = ppc.remove_duplicates(data) target_features: tuple[str] = ('VorgangsBeschreibung',) (data,) = ppc.remove_NA(data, target_features) assert len(data) == 998 def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols): (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols) (data,) = ppc.remove_duplicates(data) target_features: tuple[str] = ('VorgangsBeschreibung',) (data,) = ppc.remove_NA(data, target_features) starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!' cleaned_string = shared.clean_string_slim(starting_string) target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!' assert cleaned_string == target_string starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!' assert data.at[0, 'VorgangsBeschreibung'] == starting_string (data,) = shared.entry_wise_cleansing( data, target_features=target_features, cleansing_func=shared.clean_string_slim, ) assert data.at[0, 'VorgangsBeschreibung'] == target_string def test_analyse_feature(raw_data_path, raw_data_date_cols): (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols) (data,) = ppc.remove_duplicates(data) target_features: tuple[str] = ('VorgangsBeschreibung',) (data,) = ppc.remove_NA(data, target_features) starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!' cleaned_string = shared.clean_string_slim(starting_string) target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!' assert cleaned_string == target_string starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!' assert data.at[0, 'VorgangsBeschreibung'] == starting_string (data,) = shared.entry_wise_cleansing( data, target_features=target_features, cleansing_func=shared.clean_string_slim, ) assert data.at[0, 'VorgangsBeschreibung'] == target_string (data,) = ppc.analyse_feature(data, target_feature=target_features[0]) assert len(data) == 139 def test_numeric_pre_filter_feature(data_analyse_feature, data_numeric_pre_filter_feature): # Dataset contains 139 entries. The feature "len" has a minimum value of 15, # which occurs only once. If all values >= are retained only one entry should be # filtered. This results in a total number of 138 entries. 
def test_numeric_pre_filter_feature(data_analyse_feature, data_numeric_pre_filter_feature):
    # The dataset contains 139 entries. The feature "len" has a minimum value of 15,
    # which occurs only once. Retaining all values >= 16 therefore filters out exactly
    # one entry, leaving 138 entries in total.
    (data,) = ppc.numeric_pre_filter_feature(
        data=data_analyse_feature,
        feature='len',
        bound_lower=16,
        bound_upper=None,
    )
    assert len(data) == 138
    eval_merged = data[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]
    eval_benchmark = data_numeric_pre_filter_feature[
        ['entry', 'len', 'num_occur', 'num_assoc_obj_ids']
    ]
    assert bool((eval_merged == eval_benchmark).all(axis=None))


def test_merge_similarity_duplicates(data_analyse_feature, data_merge_similarity_duplicates):
    cos_sim_threshold = 0.8
    # reduce dataset to 10 entries
    data = data_analyse_feature.iloc[:10]
    model = model_loader.load_sentence_transformer(
        model_name=STFRModelTypes.ALL_MPNET_BASE_V2,
    )
    (merged_data,) = ppc.merge_similarity_duplicates(
        data=data,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
    )
    # constructed use case: with this threshold,
    # 2 out of 10 entries are merged into one
    assert len(merged_data) == 9
    eval_merged = merged_data[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]
    eval_benchmark = data_merge_similarity_duplicates[
        ['entry', 'len', 'num_occur', 'num_assoc_obj_ids']
    ]
    assert bool((eval_merged == eval_benchmark).all(axis=None))