import typing

from pandas import DataFrame, Series

from ihm_analyse import (
    SAVE_PATH_FOLDER,
    PATH_TO_DATASET,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
    DO_PREPROCESSING,
    DO_TOKEN_ANALYSIS,
    DO_GRAPH_POSTPROCESSING,
    create_saving_folder,
    load_pickle,
    Embedding,
    Index,
    TokenGraph,
)
from ihm_analyse.predefined_pipes import (
    pipe_target_feat,
    pipe_embds,
    pipe_merge,
    pipe_token_analysis,
)

"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""


# ** processing pipeline
def run_preprocessing() -> DataFrame:
    create_saving_folder(
        saving_path_folder=SAVE_PATH_FOLDER,
        overwrite_existing=True,
    )
    # run pipelines
    ret = typing.cast(tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
    target_feat_data = ret[0]
    # only entries with more than threshold amount of characters
    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    dupl_idx_pairs, embds = typing.cast(
        tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
        pipe_embds.run(starting_values=(subset_data,)),
    )
    # merge duplicates, results saved separately
    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
    preprocessed_data = ret[0]

    return preprocessed_data


def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    # build token graph
    (tk_graph,) = typing.cast(tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,)))
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    # filter graph by edge weight and remove single nodes (no connection)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False)
    tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')

    return tk_graph_filtered


if __name__ == '__main__':
    # ** preprocess
    if DO_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    else:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        ret = typing.cast(tuple[DataFrame], load_pickle(target_filepath))
        preprocessed_data = ret[0]

    # ** token analysis
    if DO_TOKEN_ANALYSIS:
        preprocessed_data_trunc = typing.cast(DataFrame, preprocessed_data[['entry', 'num_occur']].copy())  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    else:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)

    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING:
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    else:
        # !! hardcoded result filenames
        # filtered graph
        filename = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)