lang-main/scripts/analyse_dataset.py
2024-05-31 09:59:22 +02:00

178 lines
5.9 KiB
Python

import typing
import warnings
from pathlib import Path
from typing import cast
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
DO_TIME_ANALYSIS,
DO_TOKEN_ANALYSIS,
INPUT_PATH_FOLDER,
PATH_TO_DATASET,
SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING,
SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
pipe_timeline,
pipe_token_analysis,
)
from lang_main.types import (
ObjectID,
TimelineCandidates,
)
from pandas import DataFrame, Series
# ** processing pipeline
def run_preprocessing() -> DataFrame:
    """Run the target-feature pipeline, filter short entries and merge duplicates.

    ``create_saving_folder`` is called for ``SAVE_PATH_FOLDER`` with
    ``overwrite_existing=True`` before any pipeline runs. The dataset at
    ``PATH_TO_DATASET`` is passed to ``pipe_target_feat``; rows whose ``'len'``
    value is not greater than ``THRESHOLD_AMOUNT_CHARACTERS`` are dropped and
    the remaining rows are handed to ``pipe_merge``.

    Returns:
        The first element of the result returned by ``pipe_merge``.
    """
    create_saving_folder(saving_path_folder=SAVE_PATH_FOLDER, overwrite_existing=True)

    # ** target feature pipeline
    feature_results = typing.cast(
        tuple[DataFrame],
        pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)),
    )
    features = feature_results[0]

    # ** keep only entries longer than the character threshold
    long_enough = typing.cast(Series, features['len'] > THRESHOLD_AMOUNT_CHARACTERS)
    candidates = features.loc[long_enough].copy()

    # ** merge duplicates, results saved separately
    merge_results = typing.cast(
        tuple[DataFrame],
        pipe_merge.run(starting_values=(candidates,)),
    )
    return merge_results[0]
def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    """Build the token graph from preprocessed data and persist it.

    Args:
        preprocessed_data: Passed to ``pipe_token_analysis`` as its only
            starting value.

    Returns:
        The token graph produced by the pipeline, after ``save_graph`` and
        ``to_pickle`` have been called on it with ``SAVE_PATH_FOLDER``.
    """
    pipeline_output = typing.cast(
        tuple[TokenGraph],
        pipe_token_analysis.run(starting_values=(preprocessed_data,)),
    )
    # single-element unpacking: fails loudly unless exactly one result is returned
    (token_graph,) = pipeline_output

    token_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    pickle_name = f'{pipe_token_analysis.name}-TokenGraph'
    token_graph.to_pickle(SAVE_PATH_FOLDER, filename=pickle_name)
    return token_graph
def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    """Filter a token graph and persist the filtered result.

    The graph is first filtered with ``filter_by_edge_weight`` using
    ``THRESHOLD_EDGE_WEIGHT`` and then with ``filter_by_node_degree(1)``
    (intended to remove single nodes without any connection).

    Args:
        tk_graph: The token graph to filter.

    Returns:
        The filtered graph, after ``save_graph`` and ``to_pickle`` have been
        called on it with ``SAVE_PATH_FOLDER``.
    """
    filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT).filter_by_node_degree(1)

    filtered.save_graph(SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False)
    pickle_name = f'{pipe_token_analysis.name}-TokenGraph-filtered'
    filtered.to_pickle(SAVE_PATH_FOLDER, filename=pickle_name)
    return filtered
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    """Load the pickled ``without_nan`` data and run the timeline pipeline on it.

    The pickle is read from ``INPUT_PATH_FOLDER``; the first element of the
    loaded object is passed to ``pipe_timeline`` as its only starting value.

    Returns:
        The result of ``pipe_timeline.run``, typed as a tuple of timeline
        candidates and a mapping from object ID to string.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    # !! hardcoded input filename
    pickle_path = INPUT_PATH_FOLDER.joinpath('without_nan').with_suffix('.pkl')
    verify_path(pickle_path)

    loaded = load_pickle(pickle_path)
    timeline_input = loaded[0]

    return cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(timeline_input,)),
    )
def verify_path(
    loading_path: Path,
) -> None:
    """Ensure that a file to be loaded exists.

    Args:
        loading_path: Path that is about to be loaded.

    Raises:
        FileNotFoundError: If ``loading_path`` does not exist.
    """
    if loading_path.exists():
        return
    raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
def main() -> None:
    """Run the analysis steps selected via the ``DO_*`` / ``SKIP_*`` constants.

    The steps are handled in order: preprocessing, token analysis, graph
    postprocessing, time analysis. For each step:

    * ``DO_*`` set and ``SKIP_*`` unset: the step is computed.
    * ``DO_*`` unset and ``SKIP_*`` unset: previously saved results are loaded
      from ``SAVE_PATH_FOLDER`` (time analysis has no loading branch yet).
    * ``SKIP_*`` set: the step is skipped and a warning is emitted.

    Raises:
        FileNotFoundError: If a saved result that should be loaded is missing.
        RuntimeError: If a step should be computed although the step it
            depends on was skipped.
    """
    # tracks whether the directly preceding step produced/loaded a result
    pre_step_skipped: bool = False

    # ** preprocess
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        # glob only yields existing paths, so the only failure mode is
        # "no match"; report it as a missing file instead of an IndexError
        merge_result_path = next(SAVE_PATH_FOLDER.glob(target_pattern), None)
        if merge_result_path is None:
            raise FileNotFoundError(
                f'Could not load results. No file matching {target_pattern!r} '
                f'found in: {SAVE_PATH_FOLDER}'
            )
        ret = typing.cast(tuple[DataFrame], load_pickle(merge_result_path))
        preprocessed_data = ret[0]
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')
        # sys.exit(0)

    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        # only these two columns are handed to the token analysis
        preprocessed_data_trunc = typing.cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        tk_graph = TokenGraph.from_pickle(loading_path)
        # a loaded graph is sufficient for the next step, even if
        # preprocessing itself was skipped
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')

    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                (
                    'Preprocessing or token analysis step skipped. '
                    'Graph postprocessing cannot be performed.'
                )
            )
        # result is not consumed further in this function
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames
        # filtered graph
        filename = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')

    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # no check for fails, runs separately
        ret = run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        # placeholder: loading of saved time analysis results is not implemented
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')
# ** script entry point: run the analysis only on direct execution, not on import
if __name__ == '__main__':
    main()