# lang-main/scripts/analyse_dataset.py
import typing
from pandas import DataFrame, Series
from ihm_analyse import (
    SAVE_PATH_FOLDER,
    PATH_TO_DATASET,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
    DO_PREPROCESSING,
    DO_TOKEN_ANALYSIS,
    DO_GRAPH_POSTPROCESSING,
    create_saving_folder,
    load_pickle,
    Embedding,
    Index,
    TokenGraph,
)
from ihm_analyse.predefined_pipes import (
    pipe_target_feat,
    pipe_embds,
    pipe_merge,
    pipe_token_analysis,
)
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""


# ** processing pipeline
def run_preprocessing() -> DataFrame:
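    """Run the preprocessing pipelines and return the preprocessed data.

    Extracts target/feature data from the dataset, keeps only entries above
    THRESHOLD_AMOUNT_CHARACTERS, detects duplicate entries via embeddings and
    merges them. Results are saved to SAVE_PATH_FOLDER, which is (re)created
    with overwriting enabled.
    """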
    create_saving_folder(
        saving_path_folder=SAVE_PATH_FOLDER,
        overwrite_existing=True,
    )
    # run pipelines
    ret = typing.cast(tuple[DataFrame],
                      pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
    target_feat_data = ret[0]
    # keep only entries with more characters than the threshold
    data_filter = typing.cast(Series,
                              (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    dupl_idx_pairs, embds = typing.cast(
        tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
        pipe_embds.run(starting_values=(subset_data,))
    )
    # merge duplicates, results are saved separately
    ret = typing.cast(tuple[DataFrame],
                      pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
    preprocessed_data = ret[0]
    return preprocessed_data


def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
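    """Build the token graph from the preprocessed data and save it.

    The graph is written to SAVE_PATH_FOLDER both in undirected graph form and
    as a pickle named after the token-analysis pipe.
    """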
    # build token graph
    (tk_graph,) = typing.cast(tuple[TokenGraph],
                              pipe_token_analysis.run(starting_values=(preprocessed_data,)))
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER,
                       filename=f'{pipe_token_analysis.name}-TokenGraph')
    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
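    """Filter the token graph and save the filtered result.

    Edges below THRESHOLD_EDGE_WEIGHT are dropped first; afterwards nodes left
    without any connection are removed.
    """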
    # filter graph by edge weight, then drop nodes left without any connection
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
                                 filename='TokenGraph-filtered',
                                 directed=False)
    tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
                                filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')
    return tk_graph_filtered


if __name__ == '__main__':
    # ** preprocess
    if DO_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    else:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        ret = typing.cast(tuple[DataFrame],
                          load_pickle(target_filepath))
        preprocessed_data = ret[0]
    # ** token analysis
    if DO_TOKEN_ANALYSIS:
        preprocessed_data_trunc = typing.cast(DataFrame,
            preprocessed_data[['entry', 'num_occur']].copy())  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    else:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)
    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING:
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    else:
        # !! hardcoded result filenames
        # filtered graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)