STRF for similarity duplicates, time analysis pipeline, enhanced config

parent: 5d2c97165a
commit: bb987e2108
@@ -34,3 +34,15 @@ trials = [
     "plotly>=5.22.0",
     "dash>=2.17.0",
 ]
+
+[tool.ruff]
+line-length = 94
+indent-width = 4
+target-version = "py311"
+
+[tool.ruff.format]
+quote-style = "single"
+skip-magic-trailing-comma = false
+
+[tool.ruff.lint]
+select = ["E", "F", "I"]
@@ -1,33 +1,43 @@
 import typing
+import warnings
+from pathlib import Path
+from typing import cast
 
-from pandas import DataFrame, Series
-from ihm_analyse import (
-    SAVE_PATH_FOLDER,
-    PATH_TO_DATASET,
-    THRESHOLD_AMOUNT_CHARACTERS,
-    THRESHOLD_EDGE_WEIGHT,
-    DO_PREPROCESSING,
-    DO_TOKEN_ANALYSIS,
-    DO_GRAPH_POSTPROCESSING,
+from lang_main import (
+    TokenGraph,
     create_saving_folder,
     load_pickle,
-    Embedding,
-    Index,
-    TokenGraph,
 )
-from ihm_analyse.predefined_pipes import (
-    pipe_target_feat,
-    pipe_embds,
+from lang_main.constants import (
+    DO_GRAPH_POSTPROCESSING,
+    DO_PREPROCESSING,
+    DO_TIME_ANALYSIS,
+    DO_TOKEN_ANALYSIS,
+    INPUT_PATH_FOLDER,
+    PATH_TO_DATASET,
+    SAVE_PATH_FOLDER,
+    SKIP_GRAPH_POSTPROCESSING,
+    SKIP_PREPROCESSING,
+    SKIP_TIME_ANALYSIS,
+    SKIP_TOKEN_ANALYSIS,
+    THRESHOLD_AMOUNT_CHARACTERS,
+    THRESHOLD_EDGE_WEIGHT,
+)
+
+# Embedding,
+# PandasIndex,
+from lang_main.pipelines.predefined import (
     pipe_merge,
+    pipe_target_feat,
+    pipe_timeline,
     pipe_token_analysis,
 )
-"""
-# ** config parameters
-SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
-PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
-THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
-"""
+from lang_main.types import (
+    ObjectID,
+    TimelineCandidates,
+)
+from pandas import DataFrame, Series
 
 
 # ** processing pipeline
 def run_preprocessing() -> DataFrame:
@@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
         overwrite_existing=True,
     )
     # run pipelines
-    ret = typing.cast(tuple[DataFrame],
-                      pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
+    ret = typing.cast(
+        tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
+    )
     target_feat_data = ret[0]
     # only entries with more than threshold amount of characters
-    data_filter = typing.cast(Series,
-                              (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
-    dupl_idx_pairs, embds = typing.cast(
-        tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
-        pipe_embds.run(starting_values=(subset_data,))
-    )
+    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
+    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
+    # dupl_idx_pairs, embds = typing.cast(
+    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
+    #     pipe_embds.run(starting_values=(subset_data,)),
+    # )
     # merge duplicates, results saved separately
-    ret = typing.cast(tuple[DataFrame],
-                      pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
+    subset_data = target_feat_data.loc[data_filter].copy()
+    ret = typing.cast(
+        tuple[DataFrame],
+        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
+        pipe_merge.run(starting_values=(subset_data,)),
+    )
     preprocessed_data = ret[0]
 
     return preprocessed_data
 
 
 def run_token_analysis(
     preprocessed_data: DataFrame,
 ) -> TokenGraph:
     # build token graph
-    (tk_graph,) = typing.cast(tuple[TokenGraph],
-                              pipe_token_analysis.run(starting_values=(preprocessed_data,)))
+    (tk_graph,) = typing.cast(
+        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
+    )
     tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
-    tk_graph.to_pickle(SAVE_PATH_FOLDER,
-                       filename=f'{pipe_token_analysis.name}-TokenGraph')
+    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
 
     return tk_graph
 
 
 def run_graph_postprocessing(
     tk_graph: TokenGraph,
 ) -> TokenGraph:
     # filter graph by edge weight and remove single nodes (no connection)
     tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
     tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
-    tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
-                                 filename='TokenGraph-filtered',
-                                 directed=False)
-    tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
-                                filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')
+    tk_graph_filtered.save_graph(
+        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
+    )
+    tk_graph_filtered.to_pickle(
+        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
+    )
 
     return tk_graph_filtered
 
 
-if __name__ == '__main__':
+def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
+    filename = 'without_nan'
+    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
+    verify_path(loading_path)
+    ret = load_pickle(loading_path)
+    preprocessed_data = ret[0]
+
+    ret = cast(
+        tuple[TimelineCandidates, dict[ObjectID, str]],
+        pipe_timeline.run(starting_values=(preprocessed_data,)),
+    )
+    return ret
+
+
+def verify_path(
+    loading_path: Path,
+) -> None:
+    if not loading_path.exists():
+        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
+
+
+def main() -> None:
+    pre_step_skipped: bool = False
     # ** preprocess
-    if DO_PREPROCESSING:
+    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
         preprocessed_data = run_preprocessing()
-    else:
+    elif not SKIP_PREPROCESSING:
         # !! hardcoded result filenames
         target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
-        target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
-        ret = typing.cast(tuple[DataFrame],
-                          load_pickle(target_filepath))
+        loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
+        verify_path(loading_path)
+        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
         preprocessed_data = ret[0]
-    # ** token analysis
-    if DO_TOKEN_ANALYSIS:
-        preprocessed_data_trunc = typing.cast(DataFrame,
-                                              preprocessed_data[['entry', 'num_occur']].copy())  # type: ignore
-        tk_graph = run_token_analysis(preprocessed_data_trunc)
     else:
+        pre_step_skipped = True
+        warnings.warn('No preprocessing action selected. Skipped.')
+        # sys.exit(0)
+
+    # ** token analysis
+    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
+        if pre_step_skipped:
+            raise RuntimeError(
+                'Preprocessing step skipped. Token analysis cannot be performed.'
+            )
+        preprocessed_data_trunc = typing.cast(
+            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
+        )  # type: ignore
+        tk_graph = run_token_analysis(preprocessed_data_trunc)
+    elif not SKIP_TOKEN_ANALYSIS:
         # !! hardcoded result filenames
         # whole graph
         filename: str = f'{pipe_token_analysis.name}-TokenGraph'
-        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
-        #tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
+        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
+        verify_path(loading_path)
+        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
         tk_graph = TokenGraph.from_pickle(loading_path)
-    # ** graph postprocessing
-    if DO_GRAPH_POSTPROCESSING:
-        tk_graph_filtered = run_graph_postprocessing(tk_graph)
+        pre_step_skipped = False
     else:
+        pre_step_skipped = True
+        warnings.warn('No token analysis action selected. Skipped.')
+
+    # ** graph postprocessing
+    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
+        if pre_step_skipped:
+            raise RuntimeError(
+                (
+                    'Preprocessing or token analysis step skipped. '
+                    'Graph postprocessing cannot be performed.'
+                )
+            )
+        tk_graph_filtered = run_graph_postprocessing(tk_graph)
+    elif not SKIP_GRAPH_POSTPROCESSING:
         # !! hardcoded result filenames
         # filtered graph
         filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
-        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
-        #tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
+        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
+        verify_path(loading_path)
+        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
         tk_graph_filtered = TokenGraph.from_pickle(loading_path)
+        pre_step_skipped = False
+    else:
+        warnings.warn('No graph postprocessing action selected. Skipped.')
+
+    # ** time analysis
+    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
+        # no check for fails, runs separately
+        ret = run_time_analysis()
+    elif not SKIP_TIME_ANALYSIS:
+        ...
+    else:
+        warnings.warn('No time analysis action selected. Skipped.')
+
+
+if __name__ == '__main__':
+    main()
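Note on the control flow added above: each pipeline stage is guarded by a DO_* flag paired with a SKIP_* flag from the config. A minimal, hedged sketch (illustrative names, not part of the commit) of how such a pair is meant to resolve into run / load-cached / skip behaviour:

# Hedged sketch, not the project's code: three-way resolution of a DO_*/SKIP_* pair.
from enum import Enum, auto

class StageAction(Enum):
    RUN = auto()          # compute the stage from scratch
    LOAD_CACHED = auto()  # reuse results pickled by an earlier run
    SKIP = auto()         # do not touch the stage at all

def resolve_stage(do_flag: bool, skip_flag: bool) -> StageAction:
    if do_flag and not skip_flag:
        return StageAction.RUN
    if not skip_flag:
        return StageAction.LOAD_CACHED
    return StageAction.SKIP

# e.g. preprocessing = true,  preprocessing_skip = false -> RUN
#      preprocessing = false, preprocessing_skip = false -> LOAD_CACHED
#      any stage with *_skip = true                      -> SKIP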
scripts/inputs/without_nan.pkl (new binary file; content not shown)
@@ -1,17 +1,21 @@
 # lang_main: Config file
 
 [paths]
-results = './results/test_new2/'
-dataset = './01_2_Rohdaten_neu/Export4.csv'
+inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
+results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
+dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
 #results = './results/Export7/'
 #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
 #results = './results/Export7_trunc/'
 #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
 
 [control]
-preprocessing = false
-token_analysis = true
+preprocessing = true
+preprocessing_skip = false
+token_analysis = false
+token_analysis_skip = true
 graph_postprocessing = false
+graph_postprocessing_skip = true
 
 #[export_filenames]
 #filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
scripts/lang_main_config.toml (new file, 59 lines)
@@ -0,0 +1,59 @@
+# lang_main: Config file
+
+[paths]
+inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
+results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
+dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
+#results = './results/Export7/'
+#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
+#results = './results/Export7_trunc/'
+#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
+
+[control]
+preprocessing = true
+preprocessing_skip = true
+token_analysis = false
+token_analysis_skip = true
+graph_postprocessing = false
+graph_postprocessing_skip = true
+time_analysis = true
+time_analysis_skip = false
+
+#[export_filenames]
+#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
+
+[preprocess]
+filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
+date_cols = [
+    "VorgangsDatum",
+    "ErledigungsDatum",
+    "Arbeitsbeginn",
+    "ErstellungsDatum",
+]
+threshold_amount_characters = 5
+threshold_similarity = 0.8
+
+[graph_postprocessing]
+threshold_edge_weight = 150
+
+[time_analysis.uniqueness]
+threshold_unique_texts = 4
+criterion_feature = 'HObjektText'
+feature_name_obj_id = 'ObjektID'
+
+[time_analysis.model_input]
+# input_features = [
+#     'VorgangsTypName',
+#     'VorgangsArtText',
+#     'VorgangsBeschreibung',
+# ]
+input_features = [
+    'VorgangsBeschreibung',
+]
+activity_feature = 'VorgangsTypName'
+activity_types = [
+    'Reparaturauftrag (Portal)',
+    'Störungsmeldung',
+]
+threshold_num_acitivities = 1
+threshold_similarity = 0.8
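The new configuration file above is what feeds the constants in lang_main.constants. A hedged sketch of how these sections map onto typed constants (standard-library tomllib stands in here for the package's load_toml_config; the mapping shown is an illustration, not the actual module):

import tomllib
from pathlib import Path
from typing import Any, Final

def load_toml(path: Path) -> dict[str, Any]:
    # tomllib requires binary mode
    with path.open('rb') as f:
        return tomllib.load(f)

CONFIG: Final[dict[str, Any]] = load_toml(Path('scripts/lang_main_config.toml'))

SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']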
scripts/test.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+from lang_main.analysis.preprocessing import clean_string_slim
+from lang_main.constants import SAVE_PATH_FOLDER
+
+print(SAVE_PATH_FOLDER)
+txt = """
+Wir feiern den Jahrestag, olé!
+tel:::: !!!!???? +++49 123 456 789
+
+Doch leben wir länger.
+"""
+print(txt)
+print(clean_string_slim(txt))
@@ -1,18 +1,19 @@
-from typing import Final, Any
 import inspect
-import sys
 import logging
-from time import gmtime
+import shutil
+import sys
 from pathlib import Path
+from time import gmtime
+from typing import Any, Final
 
-from lang_main.shared import (
-    save_pickle,
-    load_pickle,
-    create_saving_folder,
-    load_toml_config,
-)
-from lang_main.analysis.preprocessing import Embedding, PandasIndex
 from lang_main.analysis.graphs import TokenGraph
+from lang_main.analysis.preprocessing import Embedding, PandasIndex
+from lang_main.shared import (
+    create_saving_folder,
+    load_pickle,
+    load_toml_config,
+    save_pickle,
+)
 
 __all__ = [
     'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
     datefmt=LOG_DATE_FMT,
 )
 
-USE_INTERNAL_CONFIG: Final[bool] = True
+CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
+USE_INTERNAL_CONFIG: Final[bool] = False
+
+pkg_dir = Path(__file__).parent
+cfg_path_internal = pkg_dir / CONFIG_FILENAME
+
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
-    curr_file_dir = Path(inspect.getfile(inspect.currentframe()))  # type: ignore
-    pkg_dir = curr_file_dir.parent
-    config_path = Path(pkg_dir, 'config.toml')
-    loaded_config = load_toml_config(path_to_toml=config_path)
-    CONFIG: Final[dict[str, Any]] = loaded_config.copy()
+    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    raise NotImplementedError("External config data not implemented yet.")
+    caller_file = Path(inspect.stack()[-1].filename)
+    if not caller_file.exists():
+        raise FileNotFoundError('Caller file could not be correctly retrieved.')
+    cfg_path_external = caller_file.parent / CONFIG_FILENAME
+    if not cfg_path_external.exists():
+        shutil.copy(cfg_path_internal, cfg_path_external)
+        sys.exit(
+            (
+                'No config file was found. A new one with default values was created '
+                'in the execution path. Please fill in the necessary values and '
+                'restart the programm.'
+            )
+        )
+    # raise NotImplementedError("External config data not implemented yet.")
+    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
 
-# ** paths
-SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
-PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
-# ** control
-DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
-DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
-DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
-# ** export
-
-# ** preprocessing
-FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
-    CONFIG['preprocess']['filename_cossim_filter_candidates']
-DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
-THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\
-    CONFIG['preprocess']['threshold_amount_characters']
-THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
-# ** token analysis
-
-# ** graph postprocessing
-THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
-# ** time analysis
-THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts']
+CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
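A hedged, standalone sketch of the external-config lookup introduced above (the function name and wording are illustrative; the package's real logic is the inline code in the diff plus load_toml_config): the entry script is found via the outermost stack frame, and a missing config file is seeded from the packaged default before exiting.

import inspect
import shutil
import sys
from pathlib import Path

def locate_config(default_cfg: Path, filename: str = 'lang_main_config.toml') -> Path:
    caller_file = Path(inspect.stack()[-1].filename)  # outermost frame = entry script
    cfg_path = caller_file.parent / filename
    if not cfg_path.exists():
        shutil.copy(default_cfg, cfg_path)  # create a template for the user to edit
        sys.exit('No config file was found; a default was created next to the script.')
    return cfg_path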
@@ -1,18 +1,18 @@
-import typing
-from typing import Any, Self, Literal, overload, Final
-import sys
-from collections.abc import Hashable
-from pathlib import Path
 import copy
+import sys
+import typing
+from collections.abc import Hashable, Iterable
+from pathlib import Path
+from typing import Any, Final, Literal, Self, overload
 
+import networkx as nx
 import numpy as np
 import numpy.typing as npt
-from networkx import Graph, DiGraph
-import networkx as nx
+from networkx import DiGraph, Graph
 from pandas import DataFrame
 
 from lang_main.loggers import logger_graphs as logger
-from lang_main.shared import save_pickle, load_pickle
+from lang_main.shared import load_pickle, save_pickle
 
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -31,8 +31,7 @@ def get_graph_metadata(
     min_edge_weight: int = 1_000_000
     max_edge_weight: int = 0
     for edge in graph.edges:
-        weight = typing.cast(int,
-                             graph[edge[0]][edge[1]]['weight'])
+        weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
         if weight < min_edge_weight:
             min_edge_weight = weight
         if weight > max_edge_weight:
@@ -54,18 +53,20 @@ def get_graph_metadata(
     )
 
     if logging:
-        logger.info((f"Graph properties: {num_nodes} Nodes, "
-                     f"{num_edges} Edges"))
-        logger.info(f"Node memory: {node_mem / 1024:.2f} KB")
-        logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB")
-        logger.info(f"Total memory: {total_mem / 1024:.2f} KB")
+        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
+        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
+        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
+        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
 
     return graph_info
 
 
 def update_graph(
     graph: Graph | DiGraph,
-    parent: Hashable,
-    child: Hashable,
+    *,
+    batch: Iterable[tuple[Hashable, Hashable]] | None = None,
+    parent: Hashable | None = None,
+    child: Hashable | None = None,
     weight_connection: int = 1,
 ) -> None:
     # !! not necessary to check for existence of nodes
@@ -78,7 +79,9 @@ def update_graph(
         graph.add_node(child)
     """
     # check if edge not in Graph
-    if not graph.has_edge(parent, child):
+    if batch is not None:
+        graph.add_edges_from(batch, weight=weight_connection)
+    elif not graph.has_edge(parent, child):
         # create new edge, nodes will be created if not already present
         graph.add_edge(parent, child, weight=weight_connection)
     else:
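For reference, a hedged standalone illustration (plain networkx, not the package's update_graph) of the two code paths the new keyword-only signature distinguishes: a batch insert of many parent/child pairs versus creating or accumulating the weight of a single edge.

import networkx as nx

g = nx.DiGraph()

# batch path: all pairs get the same initial weight attribute
g.add_edges_from([('pumpe', 'defekt'), ('motor', 'laut')], weight=1)

# single-edge path: create the edge or accumulate its weight
parent, child, w = 'pumpe', 'defekt', 1
if not g.has_edge(parent, child):
    g.add_edge(parent, child, weight=w)
else:
    g[parent][child]['weight'] += w

print(g['pumpe']['defekt']['weight'])  # 2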
@@ -87,16 +90,15 @@ def update_graph(
         weight += weight_connection
         graph[parent][child]['weight'] = weight
 
 
 # build undirected adjacency matrix
 def convert_graph_to_undirected(
     graph: DiGraph,
     logging: bool = LOGGING_DEFAULT,
 ) -> Graph:
     # get adjacency matrix
-    adj_mat = typing.cast(DataFrame,
-                          nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
-    arr = typing.cast(npt.NDArray[np.uint32],
-                      adj_mat.to_numpy())
+    adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
+    arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
     # build undirected array: adding edges of lower triangular matrix to upper one
     arr_upper = np.triu(arr)
     arr_lower = np.tril(arr)
@@ -104,23 +106,22 @@ def convert_graph_to_undirected(
     arr_new = arr_upper + arr_lower
     # assign new data and create graph
     adj_mat.loc[:] = arr_new  # type: ignore
-    graph_undir = typing.cast(Graph,
-                              nx.from_pandas_adjacency(df=adj_mat))
+    graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
 
     # info about graph
     if logging:
-        logger.info("Successfully converted graph to one with undirected edges.")
+        logger.info('Successfully converted graph to one with undirected edges.')
         _ = get_graph_metadata(graph=graph_undir, logging=logging)
 
     return graph_undir
 
-class TokenGraph(DiGraph):
+
+class TokenGraph(DiGraph):
     def __init__(
         self,
         name: str = 'TokenGraph',
         enable_logging: bool = True,
-        incoming_graph_data: Any| None = None,
+        incoming_graph_data: Any | None = None,
         **attr,
     ) -> None:
         super().__init__(incoming_graph_data, **attr)
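A hedged standalone sketch of the folding idea behind convert_graph_to_undirected: collapse a directed, weighted adjacency matrix so both edge directions land on one undirected edge. Note this sketch adds the transposed lower triangle to the upper one, which differs in detail from the helper's plain triu + tril sum shown in the diff.

import networkx as nx
import numpy as np

dg = nx.DiGraph()
dg.add_edge('a', 'b', weight=2)
dg.add_edge('b', 'a', weight=3)

adj = nx.to_pandas_adjacency(G=dg, dtype=np.uint32)
arr = adj.to_numpy()
# fold both directions onto one undirected edge (diagonal untouched)
adj.loc[:] = np.triu(arr) + np.tril(arr).T
undirected = nx.from_pandas_adjacency(df=adj)
print(undirected['a']['b']['weight'])  # 5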
@@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
         return self.__str__()
 
     def __str__(self) -> str:
-        return (f"TokenGraph(name: {self.name}, number of nodes: "
-                f"{len(self.nodes)}, number of edges: "
-                f"{len(self.edges)})")
+        return (
+            f'TokenGraph(name: {self.name}, number of nodes: '
+            f'{len(self.nodes)}, number of edges: '
+            f'{len(self.edges)})'
+        )
 
     # !! only used to verify that saving was done correctly
     """
@@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
         self,
         inplace: Literal[True] = ...,
         logging: bool | None = ...,
-    ) -> None:
-        ...
+    ) -> None: ...
 
     @overload
     def to_undirected(
         self,
         inplace: Literal[False],
         logging: bool | None = ...,
-    ) -> Graph:
-        ...
+    ) -> Graph: ...
 
     @overload
     def to_undirected(
-        self,
-        inplace: bool = ...,
-        logging: bool | None = ...
-    ) -> Graph | None:
-        ...
+        self, inplace: bool = ..., logging: bool | None = ...
+    ) -> Graph | None: ...
 
     def to_undirected(
         self,
@@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
         if logging is None:
             logging = self.logging
 
-        self._undirected = convert_graph_to_undirected(graph=self,
-                                                       logging=logging)
-        self._metadata_undirected = get_graph_metadata(graph=self._undirected,
-                                                       logging=logging)
+        self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
+        self._metadata_undirected = get_graph_metadata(
+            graph=self._undirected, logging=logging
+        )
         if not inplace:
             return self._undirected
 
@@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
         if logging is None:
             logging = self.logging
 
-        self._metadata_directed = get_graph_metadata(graph=self,
-                                                     logging=logging)
+        self._metadata_directed = get_graph_metadata(graph=self, logging=logging)
         if self._undirected is not None:
-            self._metadata_undirected = get_graph_metadata(graph=self._undirected,
-                                                           logging=logging)
+            self._metadata_undirected = get_graph_metadata(
+                graph=self._undirected, logging=logging
+            )
 
     def filter_by_edge_weight(
         self,
@@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
         filtered_graph = self.copy()
 
         for edge in original_graph_edges:
-            weight = typing.cast(int,
-                                 filtered_graph[edge[0]][edge[1]]['weight'])
+            weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
             if weight < threshold:
                 filtered_graph.remove_edge(edge[0], edge[1])
 
@@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
         filename: str | None = None,
     ) -> Path:
         if filename is not None:
-            saving_path = path.joinpath(f"{filename}")
+            saving_path = path.joinpath(f'{filename}')
         else:
-            saving_path = path.joinpath(f"{self.name}")
+            saving_path = path.joinpath(f'{self.name}')
 
         return saving_path
 
@@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
         elif not directed and self._undirected is not None:
             target_graph = self._undirected
         else:
-            raise ValueError("No undirected graph available.")
+            raise ValueError('No undirected graph available.')
 
         saving_path = saving_path.with_suffix('.graphml')
         nx.write_graphml(G=target_graph, path=saving_path)
-        logger.info(("Successfully saved graph as GraphML file "
-                     f"under {saving_path}."))
+        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
 
     def to_pickle(
         self,
@@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
         match path.suffix:
             case '.graphml':
                 graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
-                logger.info(f"Successfully loaded graph from GraphML file {path}.")
+                logger.info(f'Successfully loaded graph from GraphML file {path}.')
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
-                logger.info(f"Successfully loaded graph from pickle file {path}.")
+                logger.info(f'Successfully loaded graph from pickle file {path}.')
             case _:
-                raise ValueError("File format not supported.")
+                raise ValueError('File format not supported.')
 
         return graph
 
@@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
         path = Path(path)
 
         if path.suffix not in ('.pkl', '.pickle'):
-            raise ValueError("File format not supported.")
+            raise ValueError('File format not supported.')
 
         graph = typing.cast(Self, load_pickle(path))
 
@@ -1,29 +1,29 @@
-from typing import cast, Callable
+import re
 from collections.abc import Iterable
 from itertools import combinations
-import re
 from math import factorial
 from pathlib import Path
+from typing import Callable, cast
 
 import numpy as np
-from torch import Tensor
-from pandas import DataFrame, Series
 import pandas as pd
-from spacy.lang.de import German as GermanSpacyModel
-from spacy.tokens.doc import Doc as SpacyDoc
-from sentence_transformers import SentenceTransformer
 import sentence_transformers
 import sentence_transformers.util
+from pandas import DataFrame, Series
+from sentence_transformers import SentenceTransformer
+from spacy.lang.de import German as GermanSpacyModel
+from spacy.tokens.doc import Doc as SpacyDoc
+from torch import Tensor
 from tqdm import tqdm
 
-from lang_main.types import Embedding, PandasIndex
-from lang_main.loggers import logger_preprocess as logger
-from lang_main.pipelines.base import BasePipeline
 from lang_main.analysis.shared import (
+    candidates_by_index,
     similar_index_connection_graph,
     similar_index_groups,
 )
-#from lang_main.analysis.graphs import update_graph, get_graph_metadata
+from lang_main.loggers import logger_preprocess as logger
+from lang_main.pipelines.base import BasePipeline
+from lang_main.types import Embedding, PandasIndex
 
 
 # ** (1) dataset preparation: loading and simple preprocessing
@@ -67,11 +67,16 @@ def load_raw_data(
         parse_dates=date_cols,
         dayfirst=True,
     )
-    logger.info("Loaded dataset successfully.")
-    logger.info((f"Dataset properties: number of entries: {len(data)}, "
-                 f"number of features {len(data.columns)}"))
+    logger.info('Loaded dataset successfully.')
+    logger.info(
+        (
+            f'Dataset properties: number of entries: {len(data)}, '
+            f'number of features {len(data.columns)}'
+        )
+    )
     return (data,)
 
 
 def remove_duplicates(
     data: DataFrame,
 ) -> tuple[DataFrame]:
@@ -89,7 +94,7 @@ def remove_duplicates(
     """
     # obtain info about duplicates over all features
     duplicates_filt = data.duplicated()
-    logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}")
+    logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
     # drop duplicates
     wo_duplicates = data.drop_duplicates(ignore_index=True)
     duplicates_subset: list[str] = [
@@ -97,16 +102,26 @@ def remove_duplicates(
         'ObjektID',
     ]
     duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
-    logger.info(("Number of duplicates over subset "
-                 f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}"))
-    wo_duplicates =\
-        wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy()
-    logger.info("Removed all duplicates from dataset successfully.")
-    logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, "
-                 f"number of features {len(wo_duplicates.columns)}"))
+    logger.info(
+        (
+            'Number of duplicates over subset '
+            f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
+        )
+    )
+    wo_duplicates = wo_duplicates.drop_duplicates(
+        subset=duplicates_subset, ignore_index=True
+    ).copy()
+    logger.info('Removed all duplicates from dataset successfully.')
+    logger.info(
+        (
+            f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
+            f'number of features {len(wo_duplicates.columns)}'
+        )
+    )
 
     return (wo_duplicates,)
 
 
 def remove_NA(
     data: DataFrame,
     target_features: list[str] = [
@@ -128,15 +143,16 @@ def remove_NA(
         dataset with removed NA entries for given subset of features
     """
     wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
-    logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.")
+    logger.info(
+        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
+    )
 
     return (wo_NA,)
 
 
 # ** (2) entry-based cleansing
 # following functions clean and prepare specific entries, not whole dataset
-def clean_string_slim(
-    string: str
-) -> str:
+def clean_string_slim(string: str) -> str:
     """mapping function to clean single string entries in a series (feature-wise)
     of the dataset, used to be applied element-wise for string features
 
@@ -151,13 +167,16 @@ def clean_string_slim(string: str) -> str:
         cleaned entry
     """
     # remove special chars
-    pattern = r'[\t\n\r\f\v]'
+    pattern = r'[\t\n\r\f\v]+'
     string = re.sub(pattern, ' ', string)
+    pattern = r'([,;.:!?-_\+]){2,}'
     # remove whitespaces at the beginning and the end
+    string = re.sub(pattern, r'\1', string)
     string = string.strip()
 
     return string
 
 
 def entry_wise_cleansing(
     data: DataFrame,
     target_feature: str,
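A quick, hedged illustration of the new punctuation-collapsing step (standalone; it mirrors the two re.sub calls in clean_string_slim and matches the sample text used in scripts/test.py). Worth noting that the unescaped hyphen in `?-_` makes that part of the character class a range, so the class covers more characters than it appears to.

import re

s = 'tel:::: !!!!???? +++49 123 456 789'
s = re.sub(r'[\t\n\r\f\v]+', ' ', s)         # collapse runs of control whitespace
s = re.sub(r'([,;.:!?-_\+]){2,}', r'\1', s)  # collapse runs of punctuation to one char
print(s)  # tel: ? +49 123 456 789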
@@ -165,10 +184,16 @@ def entry_wise_cleansing(
 ) -> tuple[DataFrame]:
     # apply given cleansing function to target feature
     data[target_feature] = data[target_feature].map(cleansing_func)
-    logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< "
-                 f"for feature >>{target_feature}<<"))
+    logger.info(
+        (
+            f'Successfully applied entry-wise cleansing procedure '
+            f'>>{cleansing_func.__name__}<< '
+            f'for feature >>{target_feature}<<'
+        )
+    )
     return (data,)
 
 
 # ** in-depth analysis of one feature
 # following functions try to gain insights on a given feature of the IHM dataset such
 # as number of occurrences or associated Object IDs
@@ -178,7 +203,7 @@ def analyse_feature(
 ) -> tuple[DataFrame]:
     # feature columns
     feature_entries = data[target_feature]
-    logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}")
+    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
     # obtain unique entries
     unique_feature_entries = feature_entries.unique()
 
@@ -186,7 +211,7 @@ def analyse_feature(
     cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
     result_df = pd.DataFrame(columns=cols)
 
-    for entry in tqdm(unique_feature_entries, mininterval=1.):
+    for entry in tqdm(unique_feature_entries, mininterval=1.0):
         len_entry = len(entry)
         filt = data[target_feature] == entry
         temp = data[filt]
@@ -195,13 +220,10 @@ def analyse_feature(
         num_assoc_obj_ids = len(assoc_obj_ids)
         num_dupl = filt.sum()
 
-        conc_df = pd.DataFrame(data=[[
-            entry,
-            len_entry,
-            num_dupl,
-            assoc_obj_ids,
-            num_assoc_obj_ids
-        ]], columns=cols)
+        conc_df = pd.DataFrame(
+            data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
+            columns=cols,
+        )
 
         result_df = pd.concat([result_df, conc_df], ignore_index=True)
 
@@ -230,9 +252,9 @@ def build_embedding_map(
         is_STRF = True
 
     if not any((is_spacy, is_STRF)):
-        raise NotImplementedError("Model type unknown")
+        raise NotImplementedError('Model type unknown')
 
-    for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.):
+    for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
         # verbose code: Pyright not inferring types correctly
         idx = cast(int, idx)
         text = cast(str, text)
@@ -246,12 +268,17 @@ def build_embedding_map(
             logger.debug(f'{embd.text=} has no vector')
         elif is_STRF:
             model = cast(SentenceTransformer, model)
-            embd = cast(Tensor,
-                        model.encode(text, show_progress_bar=False))
+            embd = cast(Tensor, model.encode(text, show_progress_bar=False))
         embeddings[idx] = (embd, text)
 
     return embeddings, (is_spacy, is_STRF)
 
 
+# adapt interface
+# use candidates by index function
+# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
+
+
 # build similarity matrix out of embeddings
 def build_cosSim_matrix(
     data: Series,
@@ -259,10 +286,11 @@ def build_cosSim_matrix(
 ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
     # build empty matrix
     df_index = data.index
-    cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index,
-                                        index=df_index, dtype=np.float32)
+    cosineSim_idx_matrix = pd.DataFrame(
+        data=0.0, columns=df_index, index=df_index, dtype=np.float32
+    )
 
-    logger.info("Start building embedding map...")
+    logger.info('Start building embedding map...')
 
     # obtain embeddings based on used model
     embds, (is_spacy, is_STRF) = build_embedding_map(
@@ -270,16 +298,16 @@ def build_cosSim_matrix(
         model=model,
     )
 
-    logger.info("Embedding map built successfully.")
+    logger.info('Embedding map built successfully.')
 
     # apply index based mapping for efficient handling of large texts
     combs = combinations(df_index, 2)
-    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index)-2)
+    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
 
-    logger.info("Start calculation of similarity scores...")
+    logger.info('Start calculation of similarity scores...')
 
-    for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.):
-        #print(f"{idx1=}, {idx2=}")
+    for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
+        # print(f"{idx1=}, {idx2=}")
         embd1 = embds[idx1][0]
         embd2 = embds[idx2][0]
 
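Side note on the pair count computed above: the factorial expression is just C(n, 2) = n*(n-1)/2, so the pairwise loop grows quadratically with the number of unique texts; this is the cost that the batched SentenceTransformer path (candidates_by_index, see the end of this commit) is meant to relieve. A one-line check:

from math import comb

n = 10_000
assert comb(n, 2) == n * (n - 1) // 2 == 49_995_000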
@@ -296,10 +324,11 @@ def build_cosSim_matrix(
 
         cosineSim_idx_matrix.at[idx1, idx2] = cosSim
 
-    logger.info("Similarity scores calculated successfully.")
+    logger.info('Similarity scores calculated successfully.')
 
     return cosineSim_idx_matrix, embds
 
 
 # obtain index pairs with cosine similarity
 # greater than or equal to given threshold value
 def filt_thresh_cosSim_matrix(
@@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
     Series
         series with multi index (index pairs) and corresponding similarity score
     """
-    cosineSim_filt = cast(Series,
-                          cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack())
+    cosineSim_filt = cast(
+        Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
+    )
 
     return cosineSim_filt, embds
 
 
 def list_cosSim_dupl_candidates(
     cosineSim_filt: Series,
     embds: dict[int, tuple[Embedding, str]],
@@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
         list containing relevant index pairs for entries with similarity score greater than
         given threshold
     """
-    logger.info("Start gathering of similarity candidates...")
+    logger.info('Start gathering of similarity candidates...')
     # compare found duplicates
     columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
     df_candidates = pd.DataFrame(columns=columns)
 
     index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
 
-    for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
+    for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
         # get text content from embedding as second tuple entry
-        content = [[
-            idx1,
-            embds[idx1][1],
-            idx2,
-            embds[idx2][1],
-            score,
-        ]]
+        content = [
+            [
+                idx1,
+                embds[idx1][1],
+                idx2,
+                embds[idx2][1],
+                score,
+            ]
+        ]
         # add candidates to collection DataFrame
         df_conc = pd.DataFrame(columns=columns, data=content)
         if df_candidates.empty:
@@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
         # save index pairs
         index_pairs.append((idx1, idx2))
 
-    logger.info("Similarity candidates gathered successfully.")
+    logger.info('Similarity candidates gathered successfully.')
 
     if save_candidates:
         if saving_path is None:
-            raise ValueError(("Saving path must be provided if duplicate "
-                              "candidates should be saved."))
+            raise ValueError(
+                ('Saving path must be provided if duplicate ' 'candidates should be saved.')
+            )
         elif pipeline is not None:
-            target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_'
-                               + filename + '.xlsx')
+            target_filename = (
+                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
+            )
         elif pipeline is None:
             target_filename = f'{filename}.xlsx'
-        logger.info("Saving similarity candidates...")
+        logger.info('Saving similarity candidates...')
         target_path = saving_path.joinpath(target_filename)
         df_candidates.to_excel(target_path)
-        logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.")
+        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
 
     return index_pairs, embds
 
 
 # TODO: change implementation fully to SentenceTransformer
 # usage of batch processing for embeddings, use candidate idx function
 # from time analysis --> moved to ``helpers.py``
@@ -419,20 +455,28 @@ def similar_ids_groups(
         yield list(id_group)
 """
 
 
 def merge_similarity_dupl(
     data: DataFrame,
-    similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
+    model: SentenceTransformer,
+    cos_sim_threshold: float,
 ) -> tuple[DataFrame]:
-    logger.info("Start merging of similarity candidates...")
+    logger.info('Start merging of similarity candidates...')
 
     # data
     merged_data = data.copy()
+    model_input = merged_data['entry']
+    candidates_idx = candidates_by_index(
+        data_model_input=model_input,
+        model=model,
+        cos_sim_threshold=cos_sim_threshold,
+    )
     # graph of similar ids
-    similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
+    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
 
     for similar_id_group in similar_index_groups(similar_id_graph):
         similar_id_group = list(similar_id_group)
-        similar_data = merged_data.loc[similar_id_group,:]
+        similar_data = merged_data.loc[similar_id_group, :]
         # keep first entry with max number occurrences, then number of
         # associated objects, then length of entry
         similar_data = similar_data.sort_values(
@@ -454,10 +498,11 @@ def merge_similarity_dupl(
         merged_data.update(merged_similar_data)
         merged_data = merged_data.drop(index=similar_id_group)
 
-    logger.info("Similarity candidates merged successfully.")
+    logger.info('Similarity candidates merged successfully.')
 
     return (merged_data.copy(),)
 
 
 # merge duplicates
 def merge_similarity_dupl_old(
     data: DataFrame,
@@ -466,11 +511,10 @@ def merge_similarity_dupl_old(
     # copy pre-cleaned data
     temp = data.copy()
     index = temp.index
-    #logger.info("Start merging of similarity candidates...")
+    # logger.info("Start merging of similarity candidates...")
 
     # iterate over index pairs
-    for (i1, i2) in tqdm(dupl_idx_pairs):
+    for i1, i2 in tqdm(dupl_idx_pairs):
 
         # if an entry does not exist any more, skip this pair
         if i1 not in index or i2 not in index:
             continue
@@ -498,7 +542,7 @@ def merge_similarity_dupl_old(
         temp = temp.drop(index=i2)
         index = temp.index
 
-    #logger.info("Similarity candidates merged successfully.")
+    # logger.info("Similarity candidates merged successfully.")
 
     return (temp,)
 
@@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
         given threshold
     """
-
 
     # compare found duplicates
     columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
     df_candidates = pd.DataFrame(columns=columns)
 
     index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
 
-    for ((idx1, idx2), score) in cosineSim_filt.items():  # type: ignore
+    for (idx1, idx2), score in cosineSim_filt.items():  # type: ignore
         # get texts for comparison
         text1 = embds[idx1][1]
         text2 = embds[idx2][1]
@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# get text content from embedding as second tuple entry
|
# get text content from embedding as second tuple entry
|
||||||
content = [[
|
content = [
|
||||||
|
[
|
||||||
idx1,
|
idx1,
|
||||||
text1,
|
text1,
|
||||||
idx2,
|
idx2,
|
||||||
text2,
|
text2,
|
||||||
score,
|
score,
|
||||||
]]
|
]
|
||||||
|
]
|
||||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||||
|
|
||||||
df_candidates = pd.concat([df_candidates, df_conc])
|
df_candidates = pd.concat([df_candidates, df_conc])
|
||||||
|
|||||||
@ -1,11 +1,71 @@
from collections.abc import Iterable, Iterator
from typing import cast

import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex


def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        index pairs which meet the cosine similarity threshold
    """
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair

def similar_index_connection_graph(
@ -15,21 +75,21 @@ def similar_index_connection_graph(
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))

    for id_group in ids_groups:
        yield tuple(id_group)
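The three helpers above form a small chain: `candidates_by_index` lazily yields index pairs whose embeddings exceed the similarity threshold, `similar_index_connection_graph` links those pairs into an undirected graph, and `similar_index_groups` reads off the connected components. A minimal usage sketch, not part of this commit; the example texts are invented and the model name is the one configured in the pipeline module further down:

# Illustrative sketch (not part of the commit): grouping near-duplicate texts.
from pandas import Series
from sentence_transformers import SentenceTransformer

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

# example entries, keyed by their original DataFrame index (assumed data)
texts = Series(
    {
        10: 'Pumpe defekt, Austausch notwendig',
        11: 'Pumpe defekt - Austausch noetig',
        27: 'Filter gewechselt',
    }
)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# yields index pairs whose cosine similarity meets the threshold
pairs = candidates_by_index(data_model_input=texts, model=model, cos_sim_threshold=0.8)
# connect the pairs and read off connected components as groups of duplicates
graph, info = similar_index_connection_graph(similar_idx_pairs=pairs)
for group in similar_index_groups(graph):
    print(group)  # e.g. (10, 11)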
@ -1,21 +1,17 @@
from collections.abc import Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates


def non_relevant_obj_ids(
@ -25,16 +21,16 @@ def non_relevant_obj_ids(
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:

    data = data.copy()
    ids_to_ignore: set[ObjectID] = set()
    obj_ids = cast(
        Iterable[ObjectID],  # actually NumPy array
        data[feature_obj_id].unique(),
    )

    for obj_id in obj_ids:
        feats_per_obj_id = cast(
            Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
        )
        # check for uniqueness of given feature for current ObjectID
        # ignore NaN values
@ -46,14 +42,15 @@ def non_relevant_obj_ids(

    return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset')
    data = data.copy()
    ids_to_ignore = non_relevant_obj_ids(
        data=data,
@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
    )
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
    logger.info('Non-relevant ObjectIDs removed successfully')

    return (data,)


def generate_model_input(
    data: DataFrame,
@ -107,8 +74,8 @@ def generate_model_input(
        'VorgangsArtText',
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
@ -116,9 +83,40 @@ def generate_model_input(
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully')

    return (data,)


def filter_activities_per_obj_id(
    data: DataFrame,
    activity_feature: str = 'VorgangsTypName',
    relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
    feature_obj_id: str = 'ObjektID',
    threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
        Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
    )
    # filter for ObjectIDs with more than given number of activities
    filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
    # index of series contains ObjectIDs
    obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
    filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
        obj_ids_below_thresh
    )

    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully')

    return data_filter_activities, num_activities_per_obj_id


# for each obj_id in relevant_obj_ids
## filter data for obj_id
@ -130,6 +128,7 @@ def generate_model_input(
## obtain idx pairs, yield
## use idx pairs to get idx values of series


def get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
@ -140,14 +139,10 @@ def get_timeline_candidates_index(
    model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
    # already sorted ObjIDs (descending regarding number of activities)
    obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

    for obj_id in tqdm(obj_ids):
        data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        data_model_input = data_per_obj_id[model_input_feature]

        candidates_idx = candidates_by_index(
@ -156,7 +151,7 @@ def get_timeline_candidates_index(
            cos_sim_threshold=cos_sim_threshold,
        )
        # directly process candidates
        # candidates_idx = tuple(candidates_idx)
        similar_id_graph, _ = similar_index_connection_graph(
            similar_idx_pairs=candidates_idx,
        )
@ -164,63 +159,8 @@ def get_timeline_candidates_index(
        for index_group in similar_index_groups(similar_id_graph):
            yield obj_id, index_group


# TODO: check application for duplicate removal
def transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(

    return candidates_by_obj_id


def map_obj_id_to_texts(
    data: DataFrame,
    feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
    data = data.copy()
    obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())

    obj_id_to_text: dict[ObjectID, str] = {}

    for obj_id in tqdm(obj_ids):
        data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
        # just take first entry
        obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
        obj_text = obj_text.strip(r' ,.:')
        obj_id_to_text[obj_id] = obj_text

    return obj_id_to_text


def get_timeline_candidates(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
    model: SentenceTransformer,
    cos_sim_threshold: float,
    feature_obj_id: str = 'ObjektID',
    model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    logger.info('Obtaining timeline candidates...')
    candidates = get_timeline_candidates_index(
        data=data,
        num_activities_per_obj_id=num_activities_per_obj_id,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
        feature_obj_id=feature_obj_id,
        model_input_feature=model_input_feature,
    )
    tl_candidates = transform_timeline_candidates(candidates)
    logger.info('Timeline candidates obtained successfully.')
    # text mapping to obtain object descriptors
    logger.info('Mapping ObjectIDs to their respective text descriptor...')
    map_obj_text = map_obj_id_to_texts(
        data=data,
        feature_obj_id=feature_obj_id,
    )
    logger.info('ObjectIDs successfully mapped to text descriptors.')

    return tl_candidates, map_obj_text
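The time-analysis functions above are designed to chain: relevance filtering, model-input concatenation, activity filtering, then candidate grouping per ObjectID. A rough sketch of calling them by hand, outside the pipeline machinery (not part of the commit; whether the pickled preprocessing result referenced below has all required columns is an assumption, the keyword `target_feature_name` is taken from the pipeline wiring further down, and the threshold values mirror the config defaults):

# Illustrative sketch (not part of the commit): wiring the timeline helpers manually.
from sentence_transformers import SentenceTransformer

from lang_main import load_pickle
from lang_main.analysis.timeline import (
    filter_activities_per_obj_id,
    generate_model_input,
    get_timeline_candidates,
    remove_non_relevant_obj_ids,
)

# preprocessed DataFrame, here taken from the pickle added under test-notebooks/dashboard
(data,) = load_pickle('./test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl')
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

(data,) = remove_non_relevant_obj_ids(data, thresh_unique_feat_per_id=4)
(data,) = generate_model_input(data, target_feature_name='nlp_model_input')
data, num_activities = filter_activities_per_obj_id(data)
candidates, obj_texts = get_timeline_candidates(
    data,
    num_activities,
    model=model,
    cos_sim_threshold=0.8,
)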
@ -1,56 +1,56 @@
import re
from collections.abc import Iterator
from itertools import combinations
from typing import cast

from dateutil.parser import parse
from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm

from lang_main.analysis.graphs import (
    TokenGraph,
    update_graph,
)
from lang_main.loggers import logger_token_analysis as logger

# ** Logging
# LOGGING_LEVEL = 'INFO'
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
# logger = logging.getLogger('ihm_analyse.token_analysis')

# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])

# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])

# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()


# ** obtaining connection in texts


def pre_clean_word(string: str) -> str:
    pattern = r'[^A-Za-zäöüÄÖÜ]+'
    string = re.sub(pattern, '', string)

    return string


# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
    # print(string)
    try:
        # check if string is a number
        # if length is greater than 8, it is not a date
@ -67,10 +67,10 @@ def is_str_date(
    except ValueError:
        return False


def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:

    for descendant in token.subtree:
        # subtrees contain the token itself
        # if current element is token skip this element
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
        if is_str_date(string=descendant.text):
            continue

        logger.debug(
            (
                f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
                f'>>{descendant}<<, POS >>{descendant.pos_}<<'
            )
        )

        # eliminate cases of cross-references with verbs
        if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
            descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
        ):
            continue
        # skip cases in which descendant is indirect POS with others than verbs
        elif descendant.pos_ in POS_INDIRECT:
@ -99,6 +104,7 @@ def obtain_relevant_descendants(

# TODO look at results and fine-tune function accordingly


def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
                graph=graph,
                parent=token.lemma_,
                child=descendant.lemma_,
                weight_connection=weight,
            )
        else:
            # if indirect POS, make connection between all associated words
@ -139,12 +145,13 @@ def add_doc_info_to_graph(
                weight_connection=weight,
            )


def build_token_graph(
    data: DataFrame,
    model: GermanSpacyModel,
) -> tuple[TokenGraph]:
    # empty NetworkX directed graph
    # graph = nx.DiGraph()
    graph = TokenGraph()

    for row in tqdm(data.itertuples(), total=len(data)):
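The token-analysis module above walks each spaCy token's dependency subtree and records lemma-to-lemma connections, weighted by how often they co-occur. The project wraps this in its own `TokenGraph`/`update_graph` helpers, whose internals are not shown in this commit; the sketch below reproduces the idea with plain networkx instead and is illustrative only (the example sentence is invented, the model name matches the pipeline configuration further down):

# Illustrative sketch (not part of the commit): lemma co-occurrence graph with plain
# spaCy and networkx instead of the project's TokenGraph/update_graph helpers.
import networkx as nx
import spacy

POS_OF_INTEREST = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])

nlp = spacy.load('de_dep_news_trf')
graph = nx.DiGraph()

doc = nlp('Pumpe im Keller ist defekt und muss getauscht werden')
for token in doc:
    if token.pos_ not in POS_OF_INTEREST:
        continue
    for descendant in token.subtree:
        # the subtree contains the token itself; skip it and irrelevant POS
        if descendant is token or descendant.pos_ not in POS_OF_INTEREST:
            continue
        # accumulate an edge weight for every co-occurrence of the lemma pair
        if graph.has_edge(token.lemma_, descendant.lemma_):
            graph[token.lemma_][descendant.lemma_]['weight'] += 1
        else:
            graph.add_edge(token.lemma_, descendant.lemma_, weight=1)

print(graph.number_of_nodes(), graph.number_of_edges())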
55  src/lang_main/constants.py  Normal file
@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final

from lang_main import CONFIG

# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
    'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
    'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
    'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
    CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple(
    CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
    'threshold_num_acitivities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
    'threshold_similarity'
]
56  src/lang_main/lang_main_config.toml  Normal file
@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
    'VorgangsTypName',
    'VorgangsArtText',
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
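The constants module reads this file through the package-level CONFIG dictionary; how CONFIG itself is wired into `lang_main` is not shown in this hunk. A short, standalone sketch of reading the same file with the standard library (illustrative only; the path and the way the paired do/skip flags are combined are assumptions):

# Illustrative sketch (not part of the commit): reading the config with tomllib.
import tomllib
from pathlib import Path

config_path = Path('src/lang_main/lang_main_config.toml')
with open(config_path, 'rb') as f:
    config = tomllib.load(f)

# one plausible reading of the paired flags: run a stage if enabled and not skipped
run_preprocessing = (
    config['control']['preprocessing'] and not config['control']['preprocessing_skip']
)
print(run_preprocessing)                     # True for the values above
print(config['preprocess']['date_cols'][0])  # 'VorgangsDatum'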
@ -1,5 +1,5 @@
import logging
from typing import Final

from lang_main.types import LoggingLevels

@ -1,20 +1,18 @@
from collections.abc import Callable
from pathlib import Path
from typing import Any

from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import load_pickle, save_pickle

# ** pipelines to perform given actions on dataset in a customisable manner


class NoPerformableActionError(Exception):
    """Error describing that no action is available in the current pipeline"""


class BasePipeline:
    def __init__(
        self,
        name: str,
@ -27,6 +25,8 @@ class BasePipeline():
        self.name = name
        # working directory for pipeline == output path
        self.working_dir = working_dir
        # if not self.working_dir.exists():
        #     self.working_dir.mkdir(parents=True)

        # container for actions to perform during pass
        self.actions: list[Callable] = []
@ -39,8 +39,10 @@ class BasePipeline():
        self._intermediate_result: Any | None = None

    def __repr__(self) -> str:
        return (
            f'{self.__class__.__name__}(name: {self.name}, '
            f'working dir: {self.working_dir}, contents: {self.action_names})'
        )

    @property
    def intermediate_result(self) -> Any:
@ -53,15 +55,16 @@ class BasePipeline():
        save_result: bool = False,
    ) -> None:
        # check explicitly for function type
        # if isinstance(action, FunctionType):
        if isinstance(action, Callable):
            self.actions.append(action)
            self.action_names.append(action.__name__)
            self.actions_kwargs.append(action_kwargs.copy())
            self.is_save_result.append(save_result)
        else:
            raise TypeError(
                f'Action must be custom function, but is of type >>{type(action)}<<.'
            )

    # TODO: add multiple entries by utilising simple add method
    """
@ -107,13 +110,14 @@ class BasePipeline():
        return data

    def prep_run(self) -> None:
        logger.info(f'Starting processing pipeline >>{self.name}<<...')
        # progress tracking
        self.curr_proc_idx = 1
        # check if performable actions available
        if len(self.actions) == 0:
            raise NoPerformableActionError(
                ('The pipeline does not contain any ' 'performable actions.')
            )

    def run(
        self,
@ -135,6 +139,6 @@ class BasePipeline():
        # processing tracking
        self.curr_proc_idx += 1

        logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')

        return ret
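In short, BasePipeline collects callables plus their keyword arguments and executes them in order, optionally pickling intermediate results. A minimal usage sketch, not part of the commit: the two actions and file paths are invented stand-ins, and the exact signature of `run()` is not visible in this hunk, so the `starting_values` keyword below is an assumption.

# Illustrative sketch (not part of the commit): assembling and running a small pipeline.
from pathlib import Path

import pandas as pd
from pandas import DataFrame

from lang_main.pipelines.base import BasePipeline


def load_data(path: Path) -> tuple[DataFrame]:
    # stand-in action: each action returns a tuple that feeds the next action
    return (pd.read_csv(path, sep=';'),)


def drop_empty(data: DataFrame) -> tuple[DataFrame]:
    # stand-in action: drop completely empty rows
    return (data.dropna(how='all'),)


pipe = BasePipeline(name='Demo', working_dir=Path('./results/demo/'))
pipe.add(load_data)
pipe.add(drop_empty, save_result=True)  # mark this step's result for saving
ret = pipe.run(starting_values=(Path('./inputs/export.csv'),))  # assumed keyword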
@ -1,57 +1,144 @@
import spacy
from sentence_transformers import SentenceTransformer

from lang_main.analysis.preprocessing import (
    analyse_feature,
    clean_string_slim,
    entry_wise_cleansing,
    load_raw_data,
    merge_similarity_dupl,
    remove_duplicates,
    remove_NA,
)
from lang_main.analysis.timeline import (
    filter_activities_per_obj_id,
    generate_model_input,
    get_timeline_candidates,
    remove_non_relevant_obj_ids,
)
from lang_main.analysis.tokens import build_token_graph
from lang_main.constants import (
    ACTIVITY_FEATURE,
    ACTIVITY_TYPES,
    DATE_COLS,
    FEATURE_NAME_OBJ_ID,
    MODEL_INPUT_FEATURES,
    SAVE_PATH_FOLDER,
    THRESHOLD_NUM_ACTIVITIES,
    THRESHOLD_SIMILARITY,
    THRESHOLD_TIMELINE_SIMILARITY,
    THRESHOLD_UNIQUE_TEXTS,
    UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import BasePipeline

# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(
    load_raw_data,
    {
        'date_cols': DATE_COLS,
    },
)
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(
    entry_wise_cleansing,
    {
        'target_feature': 'VorgangsBeschreibung',
        'cleansing_func': clean_string_slim,
    },
)
pipe_target_feat.add(
    analyse_feature,
    {
        'target_feature': 'VorgangsBeschreibung',
    },
    save_result=True,
)
# output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs

# ** embedding pipe
# ?? still needed?
# using similarity between entries to catch duplicates with typo or similar content
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
# pipe_embds.add(
#     filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
# )
# pipe_embds.add(
#     list_cosSim_dupl_candidates,
#     {
#         'save_candidates': True,
#         'saving_path': SAVE_PATH_FOLDER,
#         'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
#         'pipeline': pipe_embds,
#     },
#     save_result=True,
# )

# ** Merge duplicates
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
# pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
    merge_similarity_dupl,
    {
        'model': model_stfr,
        'cos_sim_threshold': THRESHOLD_SIMILARITY,
    },
    save_result=True,
)

# ** token analysis
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(
    build_token_graph,
    {
        'model': model_spacy,
    },
    save_result=True,
)


# ** timeline analysis
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
    remove_non_relevant_obj_ids,
    {
        'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
        'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
        'feature_obj_id': FEATURE_NAME_OBJ_ID,
    },
    save_result=True,
)
pipe_timeline.add(
    generate_model_input,
    {
        'target_feature_name': 'nlp_model_input',
        'model_input_features': MODEL_INPUT_FEATURES,
    },
)
pipe_timeline.add(
    filter_activities_per_obj_id,
    {
        'activity_feature': ACTIVITY_FEATURE,
        'relevant_activity_types': ACTIVITY_TYPES,
        'feature_obj_id': FEATURE_NAME_OBJ_ID,
        'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
    },
)
pipe_timeline.add(
    get_timeline_candidates,
    {
        'model': model_stfr,
        'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
        'feature_obj_id': FEATURE_NAME_OBJ_ID,
        'model_input_feature': 'nlp_model_input',
    },
    save_result=True,
)
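The predefined pipelines are meant to be gated by the control flags from the config. A brief sketch of that wiring (not part of the commit; the module path `lang_main.pipelines.predefined`, the `starting_values` keyword, and the hand-off of one pipeline's result into the next are assumptions):

# Illustrative sketch (not part of the commit): gating pipelines on the control flags.
from lang_main.constants import DO_PREPROCESSING, DO_TIME_ANALYSIS, PATH_TO_DATASET
from lang_main.pipelines.predefined import pipe_target_feat, pipe_timeline

if DO_PREPROCESSING:
    result = pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))  # assumed keyword

if DO_TIME_ANALYSIS:
    # feed the preprocessed result into the timeline analysis (assumed hand-off)
    timeline_result = pipe_timeline.run(starting_values=result)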
@ -1,38 +1,47 @@
import os
import pickle
import shutil
import tomllib
from pathlib import Path
from typing import Any

from lang_main.loggers import logger_shared_helpers as logger


# ** Lib
def create_saving_folder(
    saving_path_folder: str | Path,
    overwrite_existing: bool = False,
) -> None:
    # check for existence of given path
    if isinstance(saving_path_folder, str):
        saving_path_folder = Path(saving_path_folder)
    if not saving_path_folder.exists():
        saving_path_folder.mkdir(parents=True)
    else:
        if overwrite_existing:
            # overwrite if desired (deletes whole path and re-creates it)
            shutil.rmtree(saving_path_folder)
            os.makedirs(saving_path_folder)
        else:
            logger.info(
                (
                    f'Path >>{saving_path_folder}<< already exists and remained '
                    f'unchanged. If you want to overwrite this path, use parameter '
                    f'>>overwrite_existing<<.'
                )
            )


def load_toml_config(
    path_to_toml: str | Path,
) -> dict[str, Any]:
    with open(path_to_toml, 'rb') as f:
        data = tomllib.load(f)
    logger.info('Loaded TOML config file successfully.')
    return data


# saving and loading using pickle
# careful: pickling from unknown sources can be dangerous
def save_pickle(
@ -41,16 +50,18 @@ def save_pickle(
) -> None:
    with open(path, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info(f'Saved file successfully under {path}')


def load_pickle(
    path: str | Path,
) -> Any:
    with open(path, 'rb') as file:
        obj = pickle.load(file)
    logger.info('Loaded file successfully.')
    return obj


# TODO: remove, too specialised for common application
"""
def filter_candidates_idx(
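A short usage sketch of the helpers whose signatures are fully visible above (not part of the commit; it assumes they are importable from `lang_main.shared`, as the pipeline base module's imports suggest, and uses the config and pickle files added elsewhere in this commit):

# Illustrative sketch (not part of the commit): using the shared helpers.
from lang_main.shared import create_saving_folder, load_pickle, load_toml_config

create_saving_folder('./results/demo/', overwrite_existing=True)
config = load_toml_config('./src/lang_main/lang_main_config.toml')
print(config['graph_postprocessing']['threshold_edge_weight'])  # 150 with the file above

# results flagged with save_result=True in a pipeline are plain pickles and reload as-is
ret = load_pickle('./test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl')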
@ -1,4 +1,4 @@
from typing import Literal, TypeAlias

import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
@ -13,29 +13,25 @@
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'ihm_analyse'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
     ]
    }
   ],
   "source": [
    "from lang_main import CONFIG\n",
    "from lang_main.lib.preprocess import (\n",
    "    load_raw_data,\n",
    "    remove_duplicates,\n",
    "    remove_NA,\n",
@ -47,8 +43,8 @@
    "    list_cosSim_dupl_candidates,\n",
    "    merge_similarity_dupl,\n",
    ")\n",
    "from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
    "from lang_main.lib.helpers import (\n",
    "    save_pickle, \n",
    "    load_pickle, \n",
    "    create_saving_folder,\n",
BIN  test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl  Normal file
Binary file not shown.
@ -1,28 +1,42 @@
from typing import cast
from pathlib import Path

import pandas as pd
import plotly.express as px
from dash import (
    Dash,
    html,
    dcc,
    callback,
    Output,
    Input,
    Output,
    State,
    callback,
    dash_table,
    dcc,
    html,
)
import plotly.express as px
from lang_main import load_pickle
import pandas as pd
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame

from lang_main import load_pickle
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
from lang_main.types import TimelineCandidates, ObjectID

#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')

# ** data
data = cast(DataFrame, load_pickle('./data.pkl'))
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
p_tl = Path(
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]

# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))

table_feats = [
    'ErstellungsDatum',
    'ErledigungsDatum',
@ -52,25 +66,28 @@ hover_data = {
app = Dash(prevent_initial_callbacks=True)

app.layout = [
    html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
    html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
    html.Div(children=[
    html.Div(
        children=[
        html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
        dcc.Dropdown(
            list(cands.keys()),
            id='dropdown-selection',
            placeholder="ObjektID auswählen...",
            placeholder='ObjektID auswählen...',
        )
        ),
    ]),
        ]
    html.Div(children=[
    ),
    html.Div(
        children=[
        html.H3(id='object_text'),
        dcc.Dropdown(id='choice-candidates'),
        dcc.Graph(id='graph-output'),
    ]),
        ]
    html.Div(children=[
    ),
        dash_table.DataTable(id='table-candidates')
    html.Div(children=[dash_table.DataTable(id='table-candidates')]),
    ]),
]


@callback(
    Output('object_text', 'children'),
    Input('dropdown-selection', 'value'),
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
    headline = f'HObjektText: {obj_text}'
    return headline


@callback(
    Output('choice-candidates', 'options'),
    Input('dropdown-selection', 'value'),
@ -90,9 +108,10 @@ def update_obj_text(obj_id):
def update_choice_candidates(obj_id):
    obj_id = int(obj_id)
    cands_obj_id = cands[obj_id]
    choices = list(range(1, len(cands_obj_id)+1))
    choices = list(range(1, len(cands_obj_id) + 1))
    return choices


@callback(
    Output('graph-output', 'figure'),
    Input('choice-candidates', 'value'),
@ -106,7 +125,7 @@ def update_timeline(index, obj_id):
    title = f'HObjektText: {obj_text}'
    # cands
    cands_obj_id = cands[obj_id]
    cands_choice = cands_obj_id[int(index)-1]
    cands_choice = cands_obj_id[int(index) - 1]
    # data
    df = data.loc[list(cands_choice)].sort_index()
    # figure
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
        title=title,
        hover_data=hover_data,
    )
    fig.update_traces(
    fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
        mode='markers+lines',
        marker=markers,
        marker_symbol='diamond'
    )
    fig.update_xaxes(
        tickformat="%B\n%Y",
        tickformat='%B\n%Y',
        rangeslider_visible=True,
    )
    fig.update_yaxes(type='category')
    fig.update_layout(hovermode="x unified")
    fig.update_layout(hovermode='x unified')
    return fig


@callback(
    [Output('table-candidates', 'data'),
    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
    Output('table-candidates', 'columns')],
    Input('choice-candidates', 'value'),
    State('dropdown-selection', 'value'),
    prevent_initial_call=True,
@ -141,13 +156,13 @@ def update_table_candidates(index, obj_id):
    obj_id = int(obj_id)
    # cands
    cands_obj_id = cands[obj_id]
    cands_choice = cands_obj_id[int(index)-1]
    cands_choice = cands_obj_id[int(index) - 1]
    # data
    df = data.loc[list(cands_choice)].sort_index()
    df = (df
    df = df.filter(items=table_feats, axis=1).sort_values(
    .filter(items=table_feats, axis=1)
        by='ErstellungsDatum', ascending=True
    .sort_values(by='ErstellungsDatum', ascending=True))
    )
    cols = [{"name": i, "id": i} for i in df.columns]
    cols = [{'name': i, 'id': i} for i in df.columns]
    # convert dates to strings
    for col in table_feats_dates:
        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
    table_data = df.to_dict('records')
    return table_data, cols


if __name__ == '__main__':
    app.run(debug=True)
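The reworked dashboard above chains its callbacks off the same `dropdown-selection` input: one callback sets the headline, one fills the candidate dropdown, and two more build the timeline figure and the candidate table from the pickled `TimelineCandidates` data. For reference, a minimal, self-contained sketch of that dropdown-to-figure pattern is shown below; the dummy `_demo` mapping and the `obj-id`/`timeline` component ids are illustrative assumptions and not part of this commit.

```python
import pandas as pd
import plotly.express as px
from dash import Dash, Input, Output, dcc, html
from dash.exceptions import PreventUpdate

# Illustrative stand-in data: one object ID mapped to a tiny timeline DataFrame.
_demo = {
    1: pd.DataFrame(
        {
            'ErstellungsDatum': pd.to_datetime(['2024-01-05', '2024-02-10', '2024-03-20']),
            'VorgangsTypName': ['Störungsmeldung', 'Reparaturauftrag (Portal)', 'Störungsmeldung'],
        }
    )
}

app = Dash(prevent_initial_callbacks=True)
app.layout = html.Div(
    children=[
        dcc.Dropdown(list(_demo.keys()), id='obj-id', placeholder='ObjektID auswählen...'),
        dcc.Graph(id='timeline'),
    ]
)


@app.callback(Output('timeline', 'figure'), Input('obj-id', 'value'))
def update_timeline(obj_id):
    # Same figure pattern as the dashboard: markers + lines, category y-axis, range slider.
    if obj_id is None:
        raise PreventUpdate
    df = _demo[int(obj_id)]
    fig = px.line(df, x='ErstellungsDatum', y='VorgangsTypName', markers=True)
    fig.update_xaxes(tickformat='%B\n%Y', rangeslider_visible=True)
    fig.update_yaxes(type='category')
    fig.update_layout(hovermode='x unified')
    return fig


if __name__ == '__main__':
    app.run(debug=True)
```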
56
test-notebooks/dashboard/lang_main_config.toml
Normal file
@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
    'VorgangsTypName',
    'VorgangsArtText',
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
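The new config file groups the run/skip switches ([control]), the preprocessing thresholds ([preprocess]) and the time-analysis parameters ([time_analysis.*]) that the refactored entry point consumes as constants. A minimal sketch of reading such a TOML file with the standard library (Python 3.11+) is shown below; the `load_config` helper is a hypothetical illustration, not lang_main's actual loader.

```python
import tomllib
from pathlib import Path


def load_config(path: str | Path) -> dict:
    """Parse the TOML config and return it as a plain nested dict."""
    with open(path, 'rb') as f:  # tomllib requires a binary file handle
        return tomllib.load(f)


if __name__ == '__main__':
    cfg = load_config('test-notebooks/dashboard/lang_main_config.toml')
    # e.g. map the [control] pairs and [preprocess] thresholds onto flags/constants
    do_preprocessing = cfg['control']['preprocessing']
    skip_preprocessing = cfg['control']['preprocessing_skip']
    threshold_chars = cfg['preprocess']['threshold_amount_characters']
    print(do_preprocessing, skip_preprocessing, threshold_chars)
```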
Binary file not shown.
663
test-notebooks/display_results.ipynb
Normal file
@ -0,0 +1,663 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from lang_main import load_pickle"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
"metadata": {},
"outputs": [],
"source": [
"def get_files(path: str) -> tuple[Path, ...]:\n",
"    p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"    assert p.exists(), \"path does not exist\"\n",
"    return tuple(p.glob(r'*'))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2559</th>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2558</th>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2557</th>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2556</th>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6782</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry ... num_assoc_obj_ids\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
"... ... ... ...\n",
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
"\n",
"[4545 rows x 5 columns]"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>162</td>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>33</td>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>131</td>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160</td>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>140</td>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>2559</td>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6757</th>\n",
" <td>2558</td>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6758</th>\n",
" <td>2557</td>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6759</th>\n",
" <td>2556</td>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6760</th>\n",
" <td>6782</td>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" index ... num_assoc_obj_ids\n",
"0 162 ... 206\n",
"1 33 ... 74\n",
"2 131 ... 4\n",
"3 160 ... 11\n",
"4 140 ... 166\n",
"... ... ... ...\n",
"6756 2559 ... 1\n",
"6757 2558 ... 1\n",
"6758 2557 ... 1\n",
"6759 2556 ... 1\n",
"6760 6782 ... 2\n",
"\n",
"[4545 rows x 6 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 72,
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(i for i in range(100000000))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num = 0\n",
"for i in tqdm(tup):\n",
"    num += i"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4999999950000000"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36366147-3632-4518-936e-878563305e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
"metadata": {},
"outputs": [],
"source": [
"arr = np.random.rand(1000,1000)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
"metadata": {},
"outputs": [],
"source": [
"idx = np.argwhere(arr >= 0.97)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30220"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
"metadata": {},
"outputs": [],
"source": [
"batch = idx[:200]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(map(tuple, batch))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(tuple(x) for x in batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
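The last cells of display_results.ipynb micro-benchmark two ways of turning the np.argwhere hit list into plain tuples: tuple(map(tuple, batch)) at about 88.5 µs versus a generator expression at about 94.9 µs for a 200-row batch. A stand-alone version of that comparison, runnable outside Jupyter with timeit, might look like the sketch below; the seeded default_rng and the loop count are assumptions added for reproducibility, not part of the notebook.

```python
import timeit

import numpy as np

# Same setup as the notebook cells, but seeded so the run is repeatable.
rng = np.random.default_rng(42)
arr = rng.random((1000, 1000))
idx = np.argwhere(arr >= 0.97)  # (n_hits, 2) array of (row, col) index pairs
batch = idx[:200]

# Time both conversions to nested tuples over the same number of loops.
t_map = timeit.timeit(lambda: tuple(map(tuple, batch)), number=10_000)
t_gen = timeit.timeit(lambda: tuple(tuple(x) for x in batch), number=10_000)
print(f'tuple(map(tuple, batch)):       {t_map:.3f} s for 10,000 loops')
print(f'tuple(tuple(x) for x in batch): {t_gen:.3f} s for 10,000 loops')
```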