import warnings
from pathlib import Path
from typing import cast

from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series


# ** processing pipeline
def run_preprocessing() -> DataFrame:
    create_saving_folder(
        saving_path_folder=SAVE_PATH_FOLDER,
        overwrite_existing=True,
    )
    # run target feature pipeline
    ret = cast(tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
    target_feat_data = ret[0]
    # keep only entries with more than the threshold amount of characters
    data_filter = cast(Series, target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)
    subset_data = target_feat_data.loc[data_filter].copy()
    # merge duplicates, results saved separately
    ret = cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
    preprocessed_data = ret[0]

    return preprocessed_data


def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    # build token graph
    (tk_graph,) = cast(
        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
    )
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    # filter graph by edge weight and remove isolated nodes (no connections)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(
        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
    )
    tk_graph_filtered.to_pickle(
        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
    )

    return tk_graph_filtered


def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    filename = 'without_nan'
    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    ret = load_pickle(loading_path)
    preprocessed_data = ret[0]

    return cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )


def verify_path(
    loading_path: Path,
) -> None:
    if not loading_path.exists():
        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
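

# The cache-loading branches in main() below all follow the same pattern:
# resolve the pickle written by a previous run, verify it exists, then
# deserialize it. A minimal sketch of that pattern as a shared helper
# (hypothetical, not called by main(), which keeps its per-stage branches
# explicit because two of them deserialize via TokenGraph.from_pickle
# instead of load_pickle):
def _load_cached_result(filename: str) -> object:
    loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    return load_pickle(loading_path)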


def main() -> None:
    pre_step_skipped: bool = False
    # ** preprocess
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames: load cached results from a previous run
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        candidates = sorted(SAVE_PATH_FOLDER.glob(target_pattern))
        if not candidates:
            raise FileNotFoundError(
                f'Could not load results. No file matching {target_pattern} '
                f'in: {SAVE_PATH_FOLDER}'
            )
        loading_path = candidates[0]
        ret = cast(tuple[DataFrame], load_pickle(loading_path))
        preprocessed_data = ret[0]
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')

    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        preprocessed_data_trunc = cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames: load whole graph from a previous run
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        tk_graph = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')

    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing or token analysis step skipped. '
                'Graph postprocessing cannot be performed.'
            )
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames: load filtered graph from a previous run
        filename = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')

    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # no failure check: loads its own input and runs independently
        run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')


if __name__ == '__main__':
    main()
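

# Flag semantics per stage, as implemented in main() above (X is one of
# PREPROCESSING, TOKEN_ANALYSIS, GRAPH_POSTPROCESSING, TIME_ANALYSIS):
#   DO_X and not SKIP_X     -> recompute the stage and persist its results
#   not DO_X and not SKIP_X -> reload cached results from a previous run
#                              (for time analysis this branch is a placeholder)
#   SKIP_X                  -> skip the stage entirely (a warning is emitted)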