# initial commit — Florian Förster, 2024-05-08 14:46:43 +02:00
# (file header: 67 lines, 2.6 KiB, Python — converted from scrape artifact to comment)
from sentence_transformers import SentenceTransformer
import spacy
from lang_main import (
SAVE_PATH_FOLDER,
DATE_COLS,
FILENAME_COSSIM_FILTER_CANDIDATES,
THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import (
load_raw_data,
remove_duplicates,
remove_NA,
clean_string_slim,
entry_wise_cleansing,
analyse_feature,
build_cosSim_matrix,
filt_thresh_cosSim_matrix,
list_cosSim_dupl_candidates,
merge_similarity_dupl,
)
from lang_main.analysis.tokens import build_token_graph
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['export_filenames']['filename_cossim_filter_candidates']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
"""
# ** pipeline configuration
#
# This module wires up four sequential processing pipelines; each stage is a
# function registered on a BasePipeline, with intermediate results optionally
# persisted (save_result=True) under SAVE_PATH_FOLDER.
#
# ** target feature preparation
# Stage 1: load raw data, drop duplicate and NA rows, then normalise and
# aggregate the target text column.
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
# 'VorgangsBeschreibung' is presumably the free-text description column of
# each record (German: "process description") — TODO confirm against the
# actual input schema.
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)
# output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs
# ** embedding pipe
# using similarity between entries to catch duplicates with typo or similar content
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
# Both models are loaded eagerly at import time of this module. The spaCy
# model is not used by this pipeline — it is only consumed by the token
# analysis pipeline at the bottom of the file.
model_spacy = spacy.load('de_dep_news_trf')
# NOTE(review): 'all-mpnet-base-v2' is an English-trained sentence encoder,
# while the spaCy model above is a German transformer model — confirm the
# monolingual encoder is intentional for German text (a multilingual
# sentence-transformers model may be a better fit).
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Stage 2: embed entries, build the pairwise cosine-similarity matrix, keep
# only pairs above THRESHOLD_SIMILARITY, and export the surviving duplicate
# candidates to disk for review.
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True)
# list_cosSim_dupl_candidates receives the pipeline object itself
# ('pipeline': pipe_embds) — presumably so it can read earlier stage results;
# verify against its implementation.
pipe_embds.add(
list_cosSim_dupl_candidates,
{'save_candidates': True,
'saving_path': SAVE_PATH_FOLDER,
'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
'pipeline': pipe_embds}, save_result=True)
# ** Merge duplicates
# Stage 3: collapse the similarity-based duplicate candidates into merged entries.
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(merge_similarity_dupl, save_result=True)
# ** token analysis
# Stage 4: build a token graph from the (German) spaCy parse of the entries.
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)