# Pipeline configuration for the lang_main analysis workflow.
# third-party
import spacy
from sentence_transformers import SentenceTransformer

# project
from lang_main import (
    SAVE_PATH_FOLDER,
    DATE_COLS,
    FILENAME_COSSIM_FILTER_CANDIDATES,
    THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import (
    load_raw_data,
    remove_duplicates,
    remove_NA,
    clean_string_slim,
    entry_wise_cleansing,
    analyse_feature,
    build_cosSim_matrix,
    filt_thresh_cosSim_matrix,
    list_cosSim_dupl_candidates,
    merge_similarity_dupl,
)
from lang_main.analysis.tokens import build_token_graph
# ** config parameters (defined in lang_main; reproduced here for reference)
#
# NOTE(review): this was previously a bare triple-quoted string — a dead
# expression statement evaluated and discarded at import time, easily
# mistaken for a module docstring (it is not the first statement).
# Converted to real comments; runtime behavior is unchanged.
#
# SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
# DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
# FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = \
#     CONFIG['export_filenames']['filename_cossim_filter_candidates']
# THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)

# Load the raw data, drop exact duplicates and NA entries, then cleanse and
# analyse the target feature column.
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
_cleansing_kwargs = {
    'target_feature': 'VorgangsBeschreibung',
    'cleansing_func': clean_string_slim,
}
pipe_target_feat.add(entry_wise_cleansing, _cleansing_kwargs)
pipe_target_feat.add(
    analyse_feature,
    {'target_feature': 'VorgangsBeschreibung'},
    save_result=True,
)
# output: DataFrame containing the target feature together with its
# number of occurrences and the associated ObjectIDs
# ** embedding pipe
# Similarity between entries is used to catch duplicates that differ only by
# typos or near-identical wording.
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)

# NLP models: the spaCy model is consumed later by the token-analysis pipe;
# the SentenceTransformer model feeds the cosine-similarity matrix below.
model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Build the pairwise cosine-similarity matrix, keep entries above the
# configured threshold, and export the resulting duplicate candidates.
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
pipe_embds.add(
    filt_thresh_cosSim_matrix,
    {'threshold': THRESHOLD_SIMILARITY},
    save_result=True,
)
_candidate_kwargs = {
    'save_candidates': True,
    'saving_path': SAVE_PATH_FOLDER,
    'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
    'pipeline': pipe_embds,
}
pipe_embds.add(list_cosSim_dupl_candidates, _candidate_kwargs, save_result=True)
# ** merge duplicates
# Collapse the similarity-based duplicate candidates produced upstream.
pipe_merge = BasePipeline(working_dir=SAVE_PATH_FOLDER, name='Merge_Duplicates')
pipe_merge.add(merge_similarity_dupl, save_result=True)
# ** token analysis
# Build a token graph from the texts using the spaCy model loaded above.
pipe_token_analysis = BasePipeline(working_dir=SAVE_PATH_FOLDER, name='Token_Analysis')
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)