from sentence_transformers import SentenceTransformer
import spacy

from lang_main import (
    SAVE_PATH_FOLDER,
    DATE_COLS,
    FILENAME_COSSIM_FILTER_CANDIDATES,
    THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import (
    load_raw_data,
    remove_duplicates,
    remove_NA,
    clean_string_slim,
    entry_wise_cleansing,
    analyse_feature,
    build_cosSim_matrix,
    filt_thresh_cosSim_matrix,
    list_cosSim_dupl_candidates,
    merge_similarity_dupl,
)
from lang_main.analysis.tokens import build_token_graph

"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['export_filenames']['filename_cossim_filter_candidates']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
"""

# ---------------------------------------------------------------------------
# Pipeline wiring. Each BasePipeline is a named sequence of steps; `add`
# registers a step function with its keyword arguments, and `save_result=True`
# persists that step's output under `working_dir`.
# ---------------------------------------------------------------------------

# --- Stage 1: target-feature preparation ----------------------------------
# Load the raw data, drop duplicates and missing rows, normalize the target
# text column, and summarize it (occurrence counts plus associated ObjectIDs).
pipe_target_feat = BasePipeline(
    name='TargetFeature',
    working_dir=SAVE_PATH_FOLDER,
)
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(
    entry_wise_cleansing,
    {
        'target_feature': 'VorgangsBeschreibung',
        'cleansing_func': clean_string_slim,
    },
)
pipe_target_feat.add(
    analyse_feature,
    {'target_feature': 'VorgangsBeschreibung'},
    save_result=True,
)

# --- Stage 2: embedding-based duplicate detection -------------------------
# Sentence embeddings + cosine similarity surface near-duplicate entries
# (typos, slightly reworded content) that exact matching misses.
# NOTE(review): the spaCy model is German ('de_dep_news_trf') while
# 'all-mpnet-base-v2' is primarily an English sentence-transformer — confirm
# this pairing is intended for the (apparently German) target text.
model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
pipe_embds.add(
    build_cosSim_matrix,
    {'model': model_stfr},
    save_result=True,
)
pipe_embds.add(
    filt_thresh_cosSim_matrix,
    {'threshold': THRESHOLD_SIMILARITY},
    save_result=True,
)
pipe_embds.add(
    list_cosSim_dupl_candidates,
    {
        'save_candidates': True,
        'saving_path': SAVE_PATH_FOLDER,
        'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
        # The step receives its own pipeline object — presumably so it can
        # access pipeline state when exporting candidates; verify in the
        # step's implementation.
        'pipeline': pipe_embds,
    },
    save_result=True,
)

# --- Stage 3: merge the similarity-detected duplicates --------------------
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(merge_similarity_dupl, save_result=True)

# --- Stage 4: token analysis ----------------------------------------------
# Build a token graph over the cleaned entries using the German spaCy model.
pipe_token_analysis = BasePipeline(
    name='Token_Analysis',
    working_dir=SAVE_PATH_FOLDER,
)
pipe_token_analysis.add(
    build_token_graph,
    {'model': model_spacy},
    save_result=True,
)