STRF for similarity duplicates, time analysis pipeline, enhanced config
This commit is contained in:
@@ -1,33 +1,43 @@
|
||||
import typing
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
|
||||
from ihm_analyse import (
|
||||
SAVE_PATH_FOLDER,
|
||||
PATH_TO_DATASET,
|
||||
THRESHOLD_AMOUNT_CHARACTERS,
|
||||
THRESHOLD_EDGE_WEIGHT,
|
||||
DO_PREPROCESSING,
|
||||
DO_TOKEN_ANALYSIS,
|
||||
DO_GRAPH_POSTPROCESSING,
|
||||
from lang_main import (
|
||||
TokenGraph,
|
||||
create_saving_folder,
|
||||
load_pickle,
|
||||
Embedding,
|
||||
Index,
|
||||
TokenGraph,
|
||||
)
|
||||
from ihm_analyse.predefined_pipes import (
|
||||
pipe_target_feat,
|
||||
pipe_embds,
|
||||
from lang_main.constants import (
|
||||
DO_GRAPH_POSTPROCESSING,
|
||||
DO_PREPROCESSING,
|
||||
DO_TIME_ANALYSIS,
|
||||
DO_TOKEN_ANALYSIS,
|
||||
INPUT_PATH_FOLDER,
|
||||
PATH_TO_DATASET,
|
||||
SAVE_PATH_FOLDER,
|
||||
SKIP_GRAPH_POSTPROCESSING,
|
||||
SKIP_PREPROCESSING,
|
||||
SKIP_TIME_ANALYSIS,
|
||||
SKIP_TOKEN_ANALYSIS,
|
||||
THRESHOLD_AMOUNT_CHARACTERS,
|
||||
THRESHOLD_EDGE_WEIGHT,
|
||||
)
|
||||
|
||||
# Embedding,
|
||||
# PandasIndex,
|
||||
from lang_main.pipelines.predefined import (
|
||||
pipe_merge,
|
||||
pipe_target_feat,
|
||||
pipe_timeline,
|
||||
pipe_token_analysis,
|
||||
)
|
||||
"""
|
||||
# ** config parameters
|
||||
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
|
||||
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
|
||||
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
|
||||
"""
|
||||
from lang_main.types import (
|
||||
ObjectID,
|
||||
TimelineCandidates,
|
||||
)
|
||||
from pandas import DataFrame, Series
|
||||
|
||||
|
||||
# ** processing pipeline
|
||||
def run_preprocessing() -> DataFrame:
|
||||
@@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
|
||||
overwrite_existing=True,
|
||||
)
|
||||
# run pipelines
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
|
||||
)
|
||||
target_feat_data = ret[0]
|
||||
# only entries with more than threshold amount of characters
|
||||
data_filter = typing.cast(Series,
|
||||
(target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
dupl_idx_pairs, embds = typing.cast(
|
||||
tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
|
||||
pipe_embds.run(starting_values=(subset_data,))
|
||||
)
|
||||
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
# dupl_idx_pairs, embds = typing.cast(
|
||||
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
|
||||
# pipe_embds.run(starting_values=(subset_data,)),
|
||||
# )
|
||||
# merge duplicates, results saved separately
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
|
||||
subset_data = target_feat_data.loc[data_filter].copy()
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame],
|
||||
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
|
||||
pipe_merge.run(starting_values=(subset_data,)),
|
||||
)
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
|
||||
return preprocessed_data
|
||||
|
||||
|
||||
def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    """Build the token graph from preprocessed data and persist it.

    Runs ``pipe_token_analysis`` with ``preprocessed_data`` as its only
    starting value, then writes the resulting graph to ``SAVE_PATH_FOLDER``
    twice: once via ``save_graph`` (undirected) and once as a pickle named
    ``<pipeline name>-TokenGraph``.

    Args:
        preprocessed_data: data handed to the token analysis pipeline.

    Returns:
        The token graph produced by the pipeline.
    """
    # the pipeline returns a 1-tuple; unpack its single element
    (tk_graph,) = typing.cast(
        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
    )
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph
|
||||
|
||||
|
||||
def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    """Filter a token graph and persist the filtered result.

    The graph is filtered by ``THRESHOLD_EDGE_WEIGHT`` and afterwards by a
    node degree of 1 (removes single nodes without any connection). The
    filtered graph is written to ``SAVE_PATH_FOLDER`` via ``save_graph``
    (undirected, file name ``TokenGraph-filtered``) and as a pickle named
    ``<pipeline name>-TokenGraph-filtered``.

    Args:
        tk_graph: token graph to filter.

    Returns:
        The filtered token graph.
    """
    # filter graph by edge weight and remove single nodes (no connection)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(
        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
    )
    tk_graph_filtered.to_pickle(
        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
    )

    return tk_graph_filtered
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    """Run the timeline pipeline on the pickled input data set.

    Loads ``without_nan.pkl`` from ``INPUT_PATH_FOLDER``, takes the first
    element of the loaded object as input data and feeds it to
    ``pipe_timeline``.

    Returns:
        The pipeline result, cast to a tuple of timeline candidates and a
        mapping from object ID to string.

    Raises:
        FileNotFoundError: if the input pickle does not exist
            (raised by ``verify_path``).
    """
    # !! hardcoded input filename
    filename = 'without_nan'
    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    loaded = load_pickle(loading_path)
    # NOTE(review): presumably a 1-tuple holding the preprocessed DataFrame,
    # as for the other stored pipeline results -- confirm
    preprocessed_data = loaded[0]

    result = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )
    return result
|
||||
|
||||
|
||||
def verify_path(
    loading_path: Path,
) -> None:
    """Ensure that a file which is about to be loaded exists.

    Args:
        loading_path: path that is going to be loaded.

    Raises:
        FileNotFoundError: if ``loading_path`` does not exist.
    """
    if not loading_path.exists():
        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
|
||||
|
||||
|
||||
def main() -> None:
    """Run the configured analysis steps one after another.

    Each of the steps preprocessing, token analysis, graph postprocessing
    and time analysis is controlled by a ``DO_*`` and a ``SKIP_*`` flag:

    - ``DO_*`` set and ``SKIP_*`` not set: the step is executed.
    - neither flag set: stored results of the step are loaded instead.
    - ``SKIP_*`` set: the step is skipped with a warning.

    Raises:
        RuntimeError: if token analysis or graph postprocessing shall be
            executed although the step it depends on was skipped.
        FileNotFoundError: if stored results shall be loaded but the
            corresponding file does not exist.
    """
    # tracks whether the data needed by the next step is unavailable
    pre_step_skipped: bool = False
    # ** preprocess
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        candidates = list(SAVE_PATH_FOLDER.glob(target_pattern))
        if not candidates:
            # indexing an empty match list would raise a bare IndexError;
            # report the missing result file like ``verify_path`` does
            raise FileNotFoundError(
                'Could not load results. No file matching '
                f'{target_pattern!r} found in: {SAVE_PATH_FOLDER}'
            )
        loading_path = candidates[0]
        verify_path(loading_path)
        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
        preprocessed_data = ret[0]
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')
        # sys.exit(0)
    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        preprocessed_data_trunc = typing.cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)
        # a stored graph is available, so the next step can run even if
        # preprocessing was skipped
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')
    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                (
                    'Preprocessing or token analysis step skipped. '
                    'Graph postprocessing cannot be performed.'
                )
            )
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames
        # filtered graph
        filename = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')
    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # not guarded by ``pre_step_skipped``: this step loads its own input
        # and does not depend on the results of the steps above
        ret = run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        # placeholder: loading stored time analysis results is not implemented
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')
|
||||
|
||||
|
||||
# run the pipeline only when executed as a script, not on import
if __name__ == '__main__':
    main()
|
||||
|
||||
BIN
scripts/inputs/without_nan.pkl
Normal file
BIN
scripts/inputs/without_nan.pkl
Normal file
Binary file not shown.
38
scripts/lang_main_config copy.toml
Normal file
38
scripts/lang_main_config copy.toml
Normal file
@@ -0,0 +1,38 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
|
||||
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
|
||||
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
|
||||
#results = './results/Export7/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
|
||||
#results = './results/Export7_trunc/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
|
||||
|
||||
[control]
|
||||
preprocessing = true
|
||||
preprocessing_skip = false
|
||||
token_analysis = false
|
||||
token_analysis_skip = true
|
||||
graph_postprocessing = false
|
||||
graph_postprocessing_skip = true
|
||||
|
||||
#[export_filenames]
|
||||
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
|
||||
[preprocess]
|
||||
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis]
|
||||
threshold_unique_texts = 5
|
||||
59
scripts/lang_main_config.toml
Normal file
59
scripts/lang_main_config.toml
Normal file
@@ -0,0 +1,59 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
|
||||
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
|
||||
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
|
||||
#results = './results/Export7/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
|
||||
#results = './results/Export7_trunc/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
|
||||
|
||||
[control]
|
||||
preprocessing = true
|
||||
preprocessing_skip = true
|
||||
token_analysis = false
|
||||
token_analysis_skip = true
|
||||
graph_postprocessing = false
|
||||
graph_postprocessing_skip = true
|
||||
time_analysis = true
|
||||
time_analysis_skip = false
|
||||
|
||||
#[export_filenames]
|
||||
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
|
||||
[preprocess]
|
||||
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis.uniqueness]
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
|
||||
[time_analysis.model_input]
|
||||
# input_features = [
|
||||
# 'VorgangsTypName',
|
||||
# 'VorgangsArtText',
|
||||
# 'VorgangsBeschreibung',
|
||||
# ]
|
||||
input_features = [
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
activity_feature = 'VorgangsTypName'
|
||||
activity_types = [
|
||||
'Reparaturauftrag (Portal)',
|
||||
'Störungsmeldung',
|
||||
]
|
||||
threshold_num_acitivities = 1
|
||||
threshold_similarity = 0.8
|
||||
12
scripts/test.py
Normal file
12
scripts/test.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from lang_main.analysis.preprocessing import clean_string_slim
|
||||
from lang_main.constants import SAVE_PATH_FOLDER
|
||||
|
||||
print(SAVE_PATH_FOLDER)
|
||||
txt = """
|
||||
Wir feiern den Jahrestag, olé!
|
||||
tel:::: !!!!???? +++49 123 456 789
|
||||
|
||||
Doch leben wir länger.
|
||||
"""
|
||||
print(txt)
|
||||
print(clean_string_slim(txt))
|
||||
Reference in New Issue
Block a user