STRF for similarity duplicates, time analysis pipeline, enhanced config

This commit is contained in:
parent 5d2c97165a
commit bb987e2108
@@ -34,3 +34,15 @@ trials = [
    "plotly>=5.22.0",
    "dash>=2.17.0",
]

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]
@@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast

from pandas import DataFrame, Series

from ihm_analyse import (
    SAVE_PATH_FOLDER,
    PATH_TO_DATASET,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
    DO_PREPROCESSING,
    DO_TOKEN_ANALYSIS,
    DO_GRAPH_POSTPROCESSING,
from lang_main import (
    TokenGraph,
    create_saving_folder,
    load_pickle,
    Embedding,
    Index,
    TokenGraph,
)
from ihm_analyse.predefined_pipes import (
    pipe_target_feat,
    pipe_embds,
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series


# ** processing pipeline
def run_preprocessing() -> DataFrame:
@@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
        overwrite_existing=True,
    )
    # run pipelines
    ret = typing.cast(tuple[DataFrame],
                      pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
    ret = typing.cast(
        tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
    )
    target_feat_data = ret[0]
    # only entries with more than threshold amount of characters
    data_filter = typing.cast(Series,
                              (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    dupl_idx_pairs, embds = typing.cast(
        tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
        pipe_embds.run(starting_values=(subset_data,))
    )
    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    # dupl_idx_pairs, embds = typing.cast(
    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
    #     pipe_embds.run(starting_values=(subset_data,)),
    # )
    # merge duplicates, results saved separately
    ret = typing.cast(tuple[DataFrame],
                      pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
    subset_data = target_feat_data.loc[data_filter].copy()
    ret = typing.cast(
        tuple[DataFrame],
        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
        pipe_merge.run(starting_values=(subset_data,)),
    )
    preprocessed_data = ret[0]

    return preprocessed_data

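Note what this hunk changes: the explicit embedding pipe (pipe_embds) and the precomputed duplicate index pairs are commented out, and pipe_merge now receives the length-filtered subset directly; the similarity merge itself moves into merge_similarity_dupl, which uses a SentenceTransformer via candidates_by_index (see the preprocessing.py hunks further down). A minimal usage sketch of the entry point, illustrative only and not part of the commit; the column names come from analyse_feature in preprocessing.py:

    # sketch only -- assumes the imports of the script above are available
    preprocessed_data = run_preprocessing()
    # the token analysis step below only consumes these two columns
    print(preprocessed_data[['entry', 'num_occur']].head())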
def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    # build token graph
    (tk_graph,) = typing.cast(tuple[TokenGraph],
                              pipe_token_analysis.run(starting_values=(preprocessed_data,)))
    (tk_graph,) = typing.cast(
        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
    )
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER,
                       filename=f'{pipe_token_analysis.name}-TokenGraph')

    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    # filter graph by edge weight and remove single nodes (no connection)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
                                 filename='TokenGraph-filtered',
                                 directed=False)
    tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
                                filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')

    tk_graph_filtered.save_graph(
        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
    )
    tk_graph_filtered.to_pickle(
        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
    )

    return tk_graph_filtered

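The two filter calls above correspond to plain networkx operations; the actual implementations are changed in the graphs.py hunks further down. A minimal standalone sketch of the same two-step filter, assuming an undirected graph whose edges carry a 'weight' attribute, illustrative only:

    import networkx as nx

    def filter_graph_sketch(graph: nx.Graph, threshold_edge_weight: int) -> nx.Graph:
        # drop edges below the weight threshold (filter_by_edge_weight)
        filtered = graph.copy()
        for u, v, attrs in list(filtered.edges(data=True)):
            if attrs['weight'] < threshold_edge_weight:
                filtered.remove_edge(u, v)
        # drop nodes left without any connection (filter_by_node_degree(1))
        filtered.remove_nodes_from([n for n, deg in list(filtered.degree) if deg < 1])
        return filtered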
if __name__ == '__main__':

def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    filename = 'without_nan'
    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    ret = load_pickle(loading_path)
    preprocessed_data = ret[0]

    ret = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )
    return ret


def verify_path(
    loading_path: Path,
) -> None:
    if not loading_path.exists():
        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')

def main() -> None:
    pre_step_skipped: bool = False
    # ** preprocess
    if DO_PREPROCESSING:
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    else:
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        ret = typing.cast(tuple[DataFrame],
                          load_pickle(target_filepath))
        loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        verify_path(loading_path)
        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
        preprocessed_data = ret[0]
    # ** token analysis
    if DO_TOKEN_ANALYSIS:
        preprocessed_data_trunc = typing.cast(DataFrame,
                                              preprocessed_data[['entry', 'num_occur']].copy())  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')
        # sys.exit(0)
    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        preprocessed_data_trunc = typing.cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        #tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)
    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING:
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')
    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                (
                    'Preprocessing or token analysis step skipped. '
                    'Graph postprocessing cannot be performed.'
                )
            )
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames
        # filtered graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
        #tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')
    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # no check for fails, runs separately
        ret = run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')


if __name__ == '__main__':
    main()

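Every stage in main() now uses the same DO_*/SKIP_* pair from the config: run the stage, reload its pickled result from an earlier run, or skip it entirely. A condensed sketch of that decision, illustrative only (the flag names map onto the [control] table of the config file shown below):

    def resolve_stage(do_stage: bool, skip_stage: bool) -> str:
        # mirrors the branching used for every stage in main()
        if do_stage and not skip_stage:
            return 'run'    # execute the stage now
        elif not skip_stage:
            return 'load'   # reuse the pickled result of an earlier run
        else:
            return 'skip'   # later stages must not depend on this one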
BIN  scripts/inputs/without_nan.pkl  Normal file
Binary file not shown.
@@ -1,17 +1,21 @@
# lang_main: Config file

[paths]
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = false
token_analysis = true
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
59  scripts/lang_main_config.toml  Normal file
@@ -0,0 +1,59 @@
# lang_main: Config file

[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
# input_features = [
#     'VorgangsTypName',
#     'VorgangsArtText',
#     'VorgangsBeschreibung',
# ]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
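The new [time_analysis.*] tables and *_skip flags are read by the constants module further down in this commit. A minimal standalone sketch of loading this file with the standard library (tomllib ships with Python 3.11, matching the target-version set in pyproject.toml above); the path below is a placeholder, illustrative only:

    import tomllib
    from pathlib import Path

    cfg_path = Path('scripts/lang_main_config.toml')   # placeholder location
    with cfg_path.open('rb') as f:                      # tomllib needs a binary handle
        cfg = tomllib.load(f)

    # dotted table headers become nested dictionaries
    print(cfg['preprocess']['threshold_similarity'])
    print(cfg['time_analysis']['model_input']['activity_types'])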
12  scripts/test.py  Normal file
@@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER

print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789

Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))
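scripts/test.py exercises clean_string_slim on a small German sample ("We celebrate the anniversary, olé!" / "Yet we live longer."). The function itself is reworked further down in preprocessing.py; for reference, a self-contained sketch built from the two patterns shown in that hunk (vertical whitespace collapsed to single spaces, runs of repeated punctuation reduced to one character), illustrative only:

    import re

    def clean_string_slim_sketch(string: str) -> str:
        # collapse tabs, newlines and other vertical whitespace into single spaces
        string = re.sub(r'[\t\n\r\f\v]+', ' ', string)
        # reduce runs of two or more punctuation characters to a single one
        string = re.sub(r'([,;.:!?-_\+]){2,}', r'\1', string)
        return string.strip()

    print(clean_string_slim_sketch('tel:::: !!!!???? +++49 123 456 789'))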
@@ -1,18 +1,19 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.shared import (
    save_pickle,
    load_pickle,
    create_saving_folder,
    load_toml_config,
)
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
    create_saving_folder,
    load_pickle,
    load_toml_config,
    save_pickle,
)

__all__ = [
    'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
    datefmt=LOG_DATE_FMT,
)

USE_INTERNAL_CONFIG: Final[bool] = True
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False

pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    curr_file_dir = Path(inspect.getfile(inspect.currentframe()))  # type: ignore
    pkg_dir = curr_file_dir.parent
    config_path = Path(pkg_dir, 'config.toml')
    loaded_config = load_toml_config(path_to_toml=config_path)
    CONFIG: Final[dict[str, Any]] = loaded_config.copy()
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    raise NotImplementedError("External config data not implemented yet.")
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    cfg_path_external = caller_file.parent / CONFIG_FILENAME
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the programm.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

# ** paths
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
    CONFIG['preprocess']['filename_cossim_filter_candidates']
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\
    CONFIG['preprocess']['threshold_amount_characters']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts']
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

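main() in the script above also imports DO_TIME_ANALYSIS, INPUT_PATH_FOLDER and the SKIP_* flags from lang_main.constants, but their assignments are not visible in this hunk. Assuming they follow the same pattern as the DO_* constants and map onto the corresponding [paths] and [control] keys of the config file, they would look roughly like this (a sketch, not part of the commit):

    # hypothetical continuation of the constants module -- not shown in the diff
    INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
    DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
    SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
    SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
    SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
    SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']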
@ -1,18 +1,18 @@
|
||||
import typing
|
||||
from typing import Any, Self, Literal, overload, Final
|
||||
import sys
|
||||
from collections.abc import Hashable
|
||||
from pathlib import Path
|
||||
import copy
|
||||
import sys
|
||||
import typing
|
||||
from collections.abc import Hashable, Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Final, Literal, Self, overload
|
||||
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from networkx import Graph, DiGraph
|
||||
import networkx as nx
|
||||
from networkx import DiGraph, Graph
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# TODO change logging behaviour, add logging to file
|
||||
LOGGING_DEFAULT: Final[bool] = False
|
||||
@ -31,18 +31,17 @@ def get_graph_metadata(
|
||||
min_edge_weight: int = 1_000_000
|
||||
max_edge_weight: int = 0
|
||||
for edge in graph.edges:
|
||||
weight = typing.cast(int,
|
||||
graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < min_edge_weight:
|
||||
min_edge_weight = weight
|
||||
if weight > max_edge_weight:
|
||||
max_edge_weight = weight
|
||||
|
||||
|
||||
# memory
|
||||
edge_mem = sum([sys.getsizeof(e) for e in graph.edges])
|
||||
node_mem = sum([sys.getsizeof(n) for n in graph.nodes])
|
||||
total_mem = edge_mem + node_mem
|
||||
|
||||
|
||||
graph_info.update(
|
||||
num_nodes=num_nodes,
|
||||
num_edges=num_edges,
|
||||
@ -52,20 +51,22 @@ def get_graph_metadata(
|
||||
edge_memory=edge_mem,
|
||||
total_memory=total_mem,
|
||||
)
|
||||
|
||||
|
||||
if logging:
|
||||
logger.info((f"Graph properties: {num_nodes} Nodes, "
|
||||
f"{num_edges} Edges"))
|
||||
logger.info(f"Node memory: {node_mem / 1024:.2f} KB")
|
||||
logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB")
|
||||
logger.info(f"Total memory: {total_mem / 1024:.2f} KB")
|
||||
|
||||
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
|
||||
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
|
||||
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
|
||||
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
|
||||
|
||||
return graph_info
|
||||
|
||||
|
||||
def update_graph(
|
||||
graph: Graph | DiGraph,
|
||||
parent: Hashable,
|
||||
child: Hashable,
|
||||
*,
|
||||
batch: Iterable[tuple[Hashable, Hashable]] | None = None,
|
||||
parent: Hashable | None = None,
|
||||
child: Hashable | None = None,
|
||||
weight_connection: int = 1,
|
||||
) -> None:
|
||||
# !! not necessary to check for existence of nodes
|
||||
@ -78,7 +79,9 @@ def update_graph(
|
||||
graph.add_node(child)
|
||||
"""
|
||||
# check if edge not in Graph
|
||||
if not graph.has_edge(parent, child):
|
||||
if batch is not None:
|
||||
graph.add_edges_from(batch, weight=weight_connection)
|
||||
elif not graph.has_edge(parent, child):
|
||||
# create new edge, nodes will be created if not already present
|
||||
graph.add_edge(parent, child, weight=weight_connection)
|
||||
else:
|
||||
@ -87,40 +90,38 @@ def update_graph(
|
||||
weight += weight_connection
|
||||
graph[parent][child]['weight'] = weight
|
||||
|
||||
|
||||
# build undirected adjacency matrix
|
||||
def convert_graph_to_undirected(
|
||||
graph: DiGraph,
|
||||
logging: bool = LOGGING_DEFAULT,
|
||||
) -> Graph:
|
||||
# get adjacency matrix
|
||||
adj_mat = typing.cast(DataFrame,
|
||||
nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32],
|
||||
adj_mat.to_numpy())
|
||||
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
|
||||
# build undirected array: adding edges of lower triangular matrix to upper one
|
||||
arr_upper = np.triu(arr)
|
||||
arr_lower = np.tril(arr)
|
||||
arr_lower = np.rot90(np.fliplr(arr_lower))
|
||||
arr_new = arr_upper + arr_lower
|
||||
# assign new data and create graph
|
||||
adj_mat.loc[:] = arr_new # type: ignore
|
||||
graph_undir = typing.cast(Graph,
|
||||
nx.from_pandas_adjacency(df=adj_mat))
|
||||
|
||||
adj_mat.loc[:] = arr_new # type: ignore
|
||||
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
|
||||
|
||||
# info about graph
|
||||
if logging:
|
||||
logger.info("Successfully converted graph to one with undirected edges.")
|
||||
logger.info('Successfully converted graph to one with undirected edges.')
|
||||
_ = get_graph_metadata(graph=graph_undir, logging=logging)
|
||||
|
||||
|
||||
return graph_undir
|
||||
|
||||
|
||||
class TokenGraph(DiGraph):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str = 'TokenGraph',
|
||||
enable_logging: bool = True,
|
||||
incoming_graph_data: Any| None = None,
|
||||
incoming_graph_data: Any | None = None,
|
||||
**attr,
|
||||
) -> None:
|
||||
super().__init__(incoming_graph_data, **attr)
|
||||
@ -133,15 +134,17 @@ class TokenGraph(DiGraph):
|
||||
self._metadata_directed: dict[str, int] = {}
|
||||
self._undirected: Graph | None = None
|
||||
self._metadata_undirected: dict[str, int] = {}
|
||||
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.__str__()
|
||||
|
||||
|
||||
def __str__(self) -> str:
|
||||
return (f"TokenGraph(name: {self.name}, number of nodes: "
|
||||
f"{len(self.nodes)}, number of edges: "
|
||||
f"{len(self.edges)})")
|
||||
|
||||
return (
|
||||
f'TokenGraph(name: {self.name}, number of nodes: '
|
||||
f'{len(self.nodes)}, number of edges: '
|
||||
f'{len(self.edges)})'
|
||||
)
|
||||
|
||||
# !! only used to verify that saving was done correctly
|
||||
"""
|
||||
def __key(self) -> tuple[Hashable, ...]:
|
||||
@ -150,7 +153,7 @@ class TokenGraph(DiGraph):
|
||||
def __hash__(self) -> int:
|
||||
return hash(self.__key())
|
||||
"""
|
||||
|
||||
|
||||
def copy(self) -> Self:
|
||||
"""returns a (deep) copy of the graph
|
||||
|
||||
@ -160,51 +163,46 @@ class TokenGraph(DiGraph):
|
||||
deep copy of the graph
|
||||
"""
|
||||
return copy.deepcopy(self)
|
||||
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return self._name
|
||||
|
||||
|
||||
@property
|
||||
def directed(self) -> Self:
|
||||
return self._directed
|
||||
|
||||
|
||||
@property
|
||||
def undirected(self) -> Graph | None:
|
||||
return self._undirected
|
||||
|
||||
|
||||
@property
|
||||
def metadata_directed(self) -> dict[str, int]:
|
||||
return self._metadata_directed
|
||||
|
||||
|
||||
@property
|
||||
def metadata_undirected(self) -> dict[str, int]:
|
||||
return self._metadata_undirected
|
||||
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
self,
|
||||
inplace: Literal[True] = ...,
|
||||
logging: bool | None = ...,
|
||||
) -> None:
|
||||
...
|
||||
|
||||
) -> None: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
self,
|
||||
inplace: Literal[False],
|
||||
logging: bool | None = ...,
|
||||
) -> Graph:
|
||||
...
|
||||
|
||||
) -> Graph: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: bool = ...,
|
||||
logging: bool | None = ...
|
||||
) -> Graph | None:
|
||||
...
|
||||
|
||||
self, inplace: bool = ..., logging: bool | None = ...
|
||||
) -> Graph | None: ...
|
||||
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace=True,
|
||||
@ -212,27 +210,27 @@ class TokenGraph(DiGraph):
|
||||
) -> Graph | None:
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._undirected = convert_graph_to_undirected(graph=self,
|
||||
logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
|
||||
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
if not inplace:
|
||||
return self._undirected
|
||||
|
||||
|
||||
def update_metadata(
|
||||
self,
|
||||
logging: bool | None = None,
|
||||
) -> None:
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._metadata_directed = get_graph_metadata(graph=self,
|
||||
logging=logging)
|
||||
|
||||
self._metadata_directed = get_graph_metadata(graph=self, logging=logging)
|
||||
if self._undirected is not None:
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
|
||||
def filter_by_edge_weight(
|
||||
self,
|
||||
threshold: int,
|
||||
@ -252,20 +250,19 @@ class TokenGraph(DiGraph):
|
||||
# filter edges by weight
|
||||
original_graph_edges = copy.deepcopy(self.edges)
|
||||
filtered_graph = self.copy()
|
||||
|
||||
|
||||
for edge in original_graph_edges:
|
||||
weight = typing.cast(int,
|
||||
filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < threshold:
|
||||
filtered_graph.remove_edge(edge[0], edge[1])
|
||||
|
||||
|
||||
if filtered_graph._undirected is not None:
|
||||
filtered_graph.to_undirected(inplace=True, logging=False)
|
||||
|
||||
|
||||
filtered_graph.update_metadata(logging=False)
|
||||
|
||||
|
||||
return filtered_graph
|
||||
|
||||
|
||||
def filter_by_node_degree(
|
||||
self,
|
||||
threshold: int,
|
||||
@ -285,31 +282,31 @@ class TokenGraph(DiGraph):
|
||||
# filter nodes by degree
|
||||
original_graph_nodes = copy.deepcopy(self.nodes)
|
||||
filtered_graph = self.copy()
|
||||
|
||||
|
||||
for node in original_graph_nodes:
|
||||
degree = filtered_graph.degree[node] # type: ignore
|
||||
degree = filtered_graph.degree[node] # type: ignore
|
||||
if degree < threshold:
|
||||
filtered_graph.remove_node(node)
|
||||
|
||||
|
||||
if filtered_graph._undirected is not None:
|
||||
filtered_graph.to_undirected(inplace=True, logging=False)
|
||||
|
||||
|
||||
filtered_graph.update_metadata(logging=False)
|
||||
|
||||
|
||||
return filtered_graph
|
||||
|
||||
|
||||
def _save_prepare(
|
||||
self,
|
||||
path: Path,
|
||||
filename: str | None = None,
|
||||
) -> Path:
|
||||
if filename is not None:
|
||||
saving_path = path.joinpath(f"{filename}")
|
||||
saving_path = path.joinpath(f'{filename}')
|
||||
else:
|
||||
saving_path = path.joinpath(f"{self.name}")
|
||||
|
||||
saving_path = path.joinpath(f'{self.name}')
|
||||
|
||||
return saving_path
|
||||
|
||||
|
||||
def save_graph(
|
||||
self,
|
||||
path: Path,
|
||||
@ -335,19 +332,18 @@ class TokenGraph(DiGraph):
|
||||
undirected graph should be exported but is not available
|
||||
"""
|
||||
saving_path = self._save_prepare(path=path, filename=filename)
|
||||
|
||||
|
||||
if directed:
|
||||
target_graph = self._directed
|
||||
elif not directed and self._undirected is not None:
|
||||
target_graph = self._undirected
|
||||
else:
|
||||
raise ValueError("No undirected graph available.")
|
||||
|
||||
raise ValueError('No undirected graph available.')
|
||||
|
||||
saving_path = saving_path.with_suffix('.graphml')
|
||||
nx.write_graphml(G=target_graph, path=saving_path)
|
||||
logger.info(("Successfully saved graph as GraphML file "
|
||||
f"under {saving_path}."))
|
||||
|
||||
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
|
||||
|
||||
def to_pickle(
|
||||
self,
|
||||
path: Path,
|
||||
@ -365,7 +361,7 @@ class TokenGraph(DiGraph):
|
||||
saving_path = self._save_prepare(path=path, filename=filename)
|
||||
saving_path = saving_path.with_suffix('.pickle')
|
||||
save_pickle(obj=self, path=saving_path)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_file(
|
||||
cls,
|
||||
@ -378,15 +374,15 @@ class TokenGraph(DiGraph):
|
||||
match path.suffix:
|
||||
case '.graphml':
|
||||
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
|
||||
logger.info(f"Successfully loaded graph from GraphML file {path}.")
|
||||
logger.info(f'Successfully loaded graph from GraphML file {path}.')
|
||||
case '.pkl' | '.pickle':
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
logger.info(f"Successfully loaded graph from pickle file {path}.")
|
||||
logger.info(f'Successfully loaded graph from pickle file {path}.')
|
||||
case _:
|
||||
raise ValueError("File format not supported.")
|
||||
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
return graph
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_pickle(
|
||||
cls,
|
||||
@ -394,10 +390,10 @@ class TokenGraph(DiGraph):
|
||||
) -> Self:
|
||||
if isinstance(path, str):
|
||||
path = Path(path)
|
||||
|
||||
|
||||
if path.suffix not in ('.pkl', '.pickle'):
|
||||
raise ValueError("File format not supported.")
|
||||
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
|
||||
return graph
|
||||
|
||||
return graph
|
||||
|
||||
@ -1,29 +1,29 @@
|
||||
from typing import cast, Callable
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from itertools import combinations
|
||||
import re
|
||||
from math import factorial
|
||||
from pathlib import Path
|
||||
from typing import Callable, cast
|
||||
|
||||
import numpy as np
|
||||
from torch import Tensor
|
||||
from pandas import DataFrame, Series
|
||||
import pandas as pd
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from pandas import DataFrame, Series
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from torch import Tensor
|
||||
from tqdm import tqdm
|
||||
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.shared import (
|
||||
candidates_by_index,
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
|
||||
|
||||
# ** (1) dataset preparation: loading and simple preprocessing
|
||||
@ -45,7 +45,7 @@ def load_raw_data(
|
||||
path : str
|
||||
path to dataset file, usually CSV file
|
||||
date_cols : list[str], optional
|
||||
columns which contain dates and are parsed as such,
|
||||
columns which contain dates and are parsed as such,
|
||||
by default (
|
||||
'VorgangsDatum',
|
||||
'ErledigungsDatum',
|
||||
@ -61,17 +61,22 @@ def load_raw_data(
|
||||
# load dataset
|
||||
date_cols = list(date_cols)
|
||||
data = pd.read_csv(
|
||||
filepath_or_buffer=path,
|
||||
sep=';',
|
||||
encoding='cp1252',
|
||||
parse_dates=date_cols,
|
||||
filepath_or_buffer=path,
|
||||
sep=';',
|
||||
encoding='cp1252',
|
||||
parse_dates=date_cols,
|
||||
dayfirst=True,
|
||||
)
|
||||
logger.info("Loaded dataset successfully.")
|
||||
logger.info((f"Dataset properties: number of entries: {len(data)}, "
|
||||
f"number of features {len(data.columns)}"))
|
||||
logger.info('Loaded dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'Dataset properties: number of entries: {len(data)}, '
|
||||
f'number of features {len(data.columns)}'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
def remove_duplicates(
|
||||
data: DataFrame,
|
||||
) -> tuple[DataFrame]:
|
||||
@ -89,7 +94,7 @@ def remove_duplicates(
|
||||
"""
|
||||
# obtain info about duplicates over all features
|
||||
duplicates_filt = data.duplicated()
|
||||
logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}")
|
||||
logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
|
||||
# drop duplicates
|
||||
wo_duplicates = data.drop_duplicates(ignore_index=True)
|
||||
duplicates_subset: list[str] = [
|
||||
@ -97,16 +102,26 @@ def remove_duplicates(
|
||||
'ObjektID',
|
||||
]
|
||||
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
|
||||
logger.info(("Number of duplicates over subset "
|
||||
f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}"))
|
||||
wo_duplicates =\
|
||||
wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy()
|
||||
logger.info("Removed all duplicates from dataset successfully.")
|
||||
logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, "
|
||||
f"number of features {len(wo_duplicates.columns)}"))
|
||||
|
||||
logger.info(
|
||||
(
|
||||
'Number of duplicates over subset '
|
||||
f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
|
||||
)
|
||||
)
|
||||
wo_duplicates = wo_duplicates.drop_duplicates(
|
||||
subset=duplicates_subset, ignore_index=True
|
||||
).copy()
|
||||
logger.info('Removed all duplicates from dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
|
||||
f'number of features {len(wo_duplicates.columns)}'
|
||||
)
|
||||
)
|
||||
|
||||
return (wo_duplicates,)
|
||||
|
||||
|
||||
def remove_NA(
|
||||
data: DataFrame,
|
||||
target_features: list[str] = [
|
||||
@ -127,17 +142,18 @@ def remove_NA(
|
||||
DataFrame
|
||||
dataset with removed NA entries for given subset of features
|
||||
"""
|
||||
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
|
||||
logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.")
|
||||
|
||||
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
|
||||
logger.info(
|
||||
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
|
||||
)
|
||||
|
||||
return (wo_NA,)
|
||||
|
||||
|
||||
# ** (2) entry-based cleansing
|
||||
# following functions clean and prepare specific entries, not whole dataset
|
||||
def clean_string_slim(
|
||||
string: str
|
||||
) -> str:
|
||||
"""mapping function to clean single string entries in a series (feature-wise)
|
||||
def clean_string_slim(string: str) -> str:
|
||||
"""mapping function to clean single string entries in a series (feature-wise)
|
||||
of the dataset, used to be applied element-wise for string features
|
||||
|
||||
Parameters
|
||||
@ -151,13 +167,16 @@ def clean_string_slim(
|
||||
cleaned entry
|
||||
"""
|
||||
# remove special chars
|
||||
pattern = r'[\t\n\r\f\v]'
|
||||
pattern = r'[\t\n\r\f\v]+'
|
||||
string = re.sub(pattern, ' ', string)
|
||||
pattern = r'([,;.:!?-_\+]){2,}'
|
||||
# remove whitespaces at the beginning and the end
|
||||
string = re.sub(pattern, r'\1', string)
|
||||
string = string.strip()
|
||||
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def entry_wise_cleansing(
|
||||
data: DataFrame,
|
||||
target_feature: str,
|
||||
@ -165,10 +184,16 @@ def entry_wise_cleansing(
|
||||
) -> tuple[DataFrame]:
|
||||
# apply given cleansing function to target feature
|
||||
data[target_feature] = data[target_feature].map(cleansing_func)
|
||||
logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< "
|
||||
f"for feature >>{target_feature}<<"))
|
||||
logger.info(
|
||||
(
|
||||
f'Successfully applied entry-wise cleansing procedure '
|
||||
f'>>{cleansing_func.__name__}<< '
|
||||
f'for feature >>{target_feature}<<'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
# ** in-depth analysis of one feature
|
||||
# following functions try to gain insights on a given feature of the IHM dataset such
|
||||
# as number of occurrences or associated Object IDs
|
||||
@ -178,15 +203,15 @@ def analyse_feature(
|
||||
) -> tuple[DataFrame]:
|
||||
# feature columns
|
||||
feature_entries = data[target_feature]
|
||||
logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}")
|
||||
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
|
||||
# obtain unique entries
|
||||
unique_feature_entries = feature_entries.unique()
|
||||
|
||||
|
||||
# prepare result DataFrame
|
||||
cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
|
||||
result_df = pd.DataFrame(columns=cols)
|
||||
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.):
|
||||
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.0):
|
||||
len_entry = len(entry)
|
||||
filt = data[target_feature] == entry
|
||||
temp = data[filt]
|
||||
@ -194,19 +219,16 @@ def analyse_feature(
|
||||
assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
|
||||
num_assoc_obj_ids = len(assoc_obj_ids)
|
||||
num_dupl = filt.sum()
|
||||
|
||||
conc_df = pd.DataFrame(data=[[
|
||||
entry,
|
||||
len_entry,
|
||||
num_dupl,
|
||||
assoc_obj_ids,
|
||||
num_assoc_obj_ids
|
||||
]], columns=cols)
|
||||
|
||||
|
||||
conc_df = pd.DataFrame(
|
||||
data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
|
||||
columns=cols,
|
||||
)
|
||||
|
||||
result_df = pd.concat([result_df, conc_df], ignore_index=True)
|
||||
|
||||
|
||||
result_df = result_df.sort_values(by='num_occur', ascending=False).copy()
|
||||
|
||||
|
||||
return (result_df,)
|
||||
|
||||
|
||||
@ -223,16 +245,16 @@ def build_embedding_map(
|
||||
embeddings: dict[int, tuple[Embedding, str]] = {}
|
||||
is_spacy = False
|
||||
is_STRF = False
|
||||
|
||||
|
||||
if isinstance(model, GermanSpacyModel):
|
||||
is_spacy = True
|
||||
elif isinstance(model, SentenceTransformer):
|
||||
is_STRF = True
|
||||
|
||||
|
||||
if not any((is_spacy, is_STRF)):
|
||||
raise NotImplementedError("Model type unknown")
|
||||
|
||||
for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.):
|
||||
raise NotImplementedError('Model type unknown')
|
||||
|
||||
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
|
||||
# verbose code: Pyright not inferring types correctly
|
||||
idx = cast(int, idx)
|
||||
text = cast(str, text)
|
||||
@ -246,12 +268,17 @@ def build_embedding_map(
|
||||
logger.debug(f'{embd.text=} has no vector')
|
||||
elif is_STRF:
|
||||
model = cast(SentenceTransformer, model)
|
||||
embd = cast(Tensor,
|
||||
model.encode(text, show_progress_bar=False))
|
||||
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
|
||||
embeddings[idx] = (embd, text)
|
||||
|
||||
|
||||
return embeddings, (is_spacy, is_STRF)
|
||||
|
||||
|
||||
# adapt interface
|
||||
# use candidates by index function
|
||||
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
|
||||
|
||||
|
||||
# build similarity matrix out of embeddings
|
||||
def build_cosSim_matrix(
|
||||
data: Series,
|
||||
@ -259,30 +286,31 @@ def build_cosSim_matrix(
|
||||
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
|
||||
# build empty matrix
|
||||
df_index = data.index
|
||||
cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index,
|
||||
index=df_index, dtype=np.float32)
|
||||
|
||||
logger.info("Start building embedding map...")
|
||||
|
||||
cosineSim_idx_matrix = pd.DataFrame(
|
||||
data=0.0, columns=df_index, index=df_index, dtype=np.float32
|
||||
)
|
||||
|
||||
logger.info('Start building embedding map...')
|
||||
|
||||
# obtain embeddings based on used model
|
||||
embds, (is_spacy, is_STRF) = build_embedding_map(
|
||||
data=data,
|
||||
model=model,
|
||||
)
|
||||
|
||||
logger.info("Embedding map built successfully.")
|
||||
|
||||
|
||||
logger.info('Embedding map built successfully.')
|
||||
|
||||
# apply index based mapping for efficient handling of large texts
|
||||
combs = combinations(df_index, 2)
|
||||
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index)-2)
|
||||
|
||||
logger.info("Start calculation of similarity scores...")
|
||||
|
||||
for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.):
|
||||
#print(f"{idx1=}, {idx2=}")
|
||||
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
|
||||
|
||||
logger.info('Start calculation of similarity scores...')
|
||||
|
||||
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
|
||||
# print(f"{idx1=}, {idx2=}")
|
||||
embd1 = embds[idx1][0]
|
||||
embd2 = embds[idx2][0]
|
||||
|
||||
|
||||
# calculate similarity based on model type
|
||||
if is_spacy:
|
||||
embd1 = cast(SpacyDoc, embds[idx1][0])
|
||||
@ -293,14 +321,15 @@ def build_cosSim_matrix(
|
||||
embd2 = cast(Tensor, embds[idx2][0])
|
||||
cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
|
||||
cosSim = cast(float, cosSim.item())
|
||||
|
||||
|
||||
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
|
||||
|
||||
logger.info("Similarity scores calculated successfully.")
|
||||
|
||||
|
||||
logger.info('Similarity scores calculated successfully.')
|
||||
|
||||
return cosineSim_idx_matrix, embds
|
||||
|
||||
# obtain index pairs with cosine similarity
|
||||
|
||||
# obtain index pairs with cosine similarity
|
||||
# greater than or equal to given threshold value
|
||||
def filt_thresh_cosSim_matrix(
|
||||
cosineSim_idx_matrix: DataFrame,
|
||||
@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
|
||||
Series
|
||||
series with multi index (index pairs) and corresponding similarity score
|
||||
"""
|
||||
cosineSim_filt = cast(Series,
|
||||
cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack())
|
||||
|
||||
cosineSim_filt = cast(
|
||||
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
|
||||
)
|
||||
|
||||
return cosineSim_filt, embds
|
||||
|
||||
|
||||
def list_cosSim_dupl_candidates(
|
||||
cosineSim_filt: Series,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
@ -335,7 +366,7 @@ def list_cosSim_dupl_candidates(
|
||||
filename: str = 'CosSim-FilterCandidates',
|
||||
pipeline: BasePipeline | None = None,
|
||||
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
|
||||
"""providing an overview of candidates with a similarity score greater than
|
||||
"""providing an overview of candidates with a similarity score greater than
|
||||
given threshold; more suitable for debugging purposes
|
||||
|
||||
Returns
|
||||
@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
|
||||
list containing relevant index pairs for entries with similarity score greater than
|
||||
given threshold
|
||||
"""
|
||||
logger.info("Start gathering of similarity candidates...")
|
||||
logger.info('Start gathering of similarity candidates...')
|
||||
# compare found duplicates
|
||||
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
idx1,
|
||||
embds[idx1][1],
|
||||
idx2,
|
||||
embds[idx2][1],
|
||||
score,
|
||||
]]
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
embds[idx1][1],
|
||||
idx2,
|
||||
embds[idx2][1],
|
||||
score,
|
||||
]
|
||||
]
|
||||
# add candidates to collection DataFrame
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
if df_candidates.empty:
|
||||
@ -370,25 +403,28 @@ def list_cosSim_dupl_candidates(
|
||||
df_candidates = pd.concat([df_candidates, df_conc])
|
||||
# save index pairs
|
||||
index_pairs.append((idx1, idx2))
|
||||
|
||||
logger.info("Similarity candidates gathered successfully.")
|
||||
|
||||
|
||||
logger.info('Similarity candidates gathered successfully.')
|
||||
|
||||
if save_candidates:
|
||||
if saving_path is None:
|
||||
raise ValueError(("Saving path must be provided if duplicate "
|
||||
"candidates should be saved."))
|
||||
raise ValueError(
|
||||
('Saving path must be provided if duplicate ' 'candidates should be saved.')
|
||||
)
|
||||
elif pipeline is not None:
|
||||
target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_'
|
||||
+ filename + '.xlsx')
|
||||
target_filename = (
|
||||
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
|
||||
)
|
||||
elif pipeline is None:
|
||||
target_filename = f'{filename}.xlsx'
|
||||
logger.info("Saving similarity candidates...")
|
||||
logger.info('Saving similarity candidates...')
|
||||
target_path = saving_path.joinpath(target_filename)
|
||||
df_candidates.to_excel(target_path)
|
||||
logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.")
|
||||
|
||||
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
|
||||
|
||||
return index_pairs, embds
|
||||
|
||||
|
||||
# TODO: change implementation fully to SentenceTransformer
|
||||
# usage of batch processing for embeddings, use candidate idx function
|
||||
# from time analysis --> moved to ``helpers.py``
|
||||
@ -419,24 +455,32 @@ def similar_ids_groups(
|
||||
yield list(id_group)
|
||||
"""
|
||||
|
||||
|
||||
def merge_similarity_dupl(
|
||||
data: DataFrame,
|
||||
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info("Start merging of similarity candidates...")
|
||||
|
||||
logger.info('Start merging of similarity candidates...')
|
||||
|
||||
# data
|
||||
merged_data = data.copy()
|
||||
model_input = merged_data['entry']
|
||||
candidates_idx = candidates_by_index(
|
||||
data_model_input=model_input,
|
||||
model=model,
|
||||
cos_sim_threshold=cos_sim_threshold,
|
||||
)
|
||||
# graph of similar ids
|
||||
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
|
||||
|
||||
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
|
||||
|
||||
for similar_id_group in similar_index_groups(similar_id_graph):
|
||||
similar_id_group = list(similar_id_group)
|
||||
similar_data = merged_data.loc[similar_id_group,:]
|
||||
# keep first entry with max number occurrences, then number of
|
||||
similar_data = merged_data.loc[similar_id_group, :]
|
||||
# keep first entry with max number occurrences, then number of
|
||||
# associated objects, then length of entry
|
||||
similar_data = similar_data.sort_values(
|
||||
by=['num_occur', 'num_assoc_obj_ids', 'len'],
|
||||
by=['num_occur', 'num_assoc_obj_ids', 'len'],
|
||||
ascending=[False, False, False],
|
||||
)
|
||||
# merge information to first entry
|
||||
@ -453,11 +497,12 @@ def merge_similarity_dupl(
|
||||
# update entry in main dataset, drop remaining entries
|
||||
merged_data.update(merged_similar_data)
|
||||
merged_data = merged_data.drop(index=similar_id_group)
|
||||
|
||||
logger.info("Similarity candidates merged successfully.")
|
||||
|
||||
|
||||
logger.info('Similarity candidates merged successfully.')
|
||||
|
||||
return (merged_data.copy(),)
|
||||
|
||||
|
||||
# merge duplicates
|
||||
def merge_similarity_dupl_old(
|
||||
data: DataFrame,
|
||||
@ -466,15 +511,14 @@ def merge_similarity_dupl_old(
|
||||
# copy pre-cleaned data
|
||||
temp = data.copy()
|
||||
index = temp.index
|
||||
#logger.info("Start merging of similarity candidates...")
|
||||
|
||||
# logger.info("Start merging of similarity candidates...")
|
||||
|
||||
# iterate over index pairs
|
||||
for (i1, i2) in tqdm(dupl_idx_pairs):
|
||||
|
||||
for i1, i2 in tqdm(dupl_idx_pairs):
|
||||
# if an entry does not exist any more, skip this pair
|
||||
if i1 not in index or i2 not in index:
|
||||
continue
|
||||
|
||||
|
||||
# merge num occur
|
||||
num_occur1 = temp.at[i1, 'num_occur']
|
||||
num_occur2 = temp.at[i2, 'num_occur']
|
||||
@ -493,13 +537,13 @@ def merge_similarity_dupl_old(
|
||||
temp.at[i1, 'num_occur'] = new_num_occur
|
||||
temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
|
||||
temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids
|
||||
|
||||
|
||||
# drop second entry
|
||||
temp = temp.drop(index=i2)
|
||||
index = temp.index
|
||||
|
||||
#logger.info("Similarity candidates merged successfully.")
|
||||
|
||||
|
||||
# logger.info("Similarity candidates merged successfully.")
|
||||
|
||||
return (temp,)
|
||||
|
||||
|
||||
@ -508,7 +552,7 @@ def choose_cosSim_dupl_candidates(
|
||||
cosineSim_filt: Series,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
|
||||
"""providing an overview of candidates with a similarity score greater than
|
||||
"""providing an overview of candidates with a similarity score greater than
|
||||
given threshold, but decision is made manually by iterating through the candidates
|
||||
with user interaction; more suitable for debugging purposes
|
||||
|
||||
@ -520,15 +564,14 @@ def choose_cosSim_dupl_candidates(
|
||||
list containing relevant index pairs for entries with similarity score greater than
|
||||
given threshold
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# compare found duplicates
|
||||
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore
|
||||
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
|
||||
# get texts for comparison
|
||||
text1 = embds[idx1][1]
|
||||
text2 = embds[idx2][1]
|
||||
@ -537,21 +580,23 @@ def choose_cosSim_dupl_candidates(
|
||||
print('text1:\n', text1, '\n', flush=True)
|
||||
print('text2:\n', text2, '\n', flush=True)
|
||||
decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')
|
||||
|
||||
|
||||
if not decision == 'y':
|
||||
continue
|
||||
|
||||
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
idx1,
|
||||
text1,
|
||||
idx2,
|
||||
text2,
|
||||
score,
|
||||
]]
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
text1,
|
||||
idx2,
|
||||
text2,
|
||||
score,
|
||||
]
|
||||
]
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
|
||||
|
||||
df_candidates = pd.concat([df_candidates, df_conc])
|
||||
index_pairs.append((idx1, idx2))
|
||||
|
||||
return df_candidates, index_pairs
|
||||
|
||||
return df_candidates, index_pairs
|
||||
|
||||
@ -1,11 +1,71 @@
|
||||
from typing import cast
|
||||
from collections.abc import Iterable, Iterator
|
||||
from typing import cast
|
||||
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from networkx import Graph
|
||||
from pandas import Series
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from torch import Tensor
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.analysis.graphs import get_graph_metadata, update_graph
|
||||
from lang_main.types import PandasIndex
|
||||
from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
|
||||
|
||||
def candidates_by_index(
|
||||
data_model_input: Series,
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float = 0.5,
|
||||
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
|
||||
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
|
||||
"""function to filter candidate indices based on cosine similarity
|
||||
using SentenceTransformer model in batch mode,
|
||||
feed data as Series to retain information about indices of entries and
|
||||
access them later in the original dataset
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj_id : ObjectID
|
||||
_description_
|
||||
data_model_input : Series
|
||||
containing indices and text entries to process
|
||||
model : SentenceTransformer
|
||||
necessary SentenceTransformer model to encode text entries
|
||||
cos_sim_threshold : float, optional
|
||||
threshold for cosine similarity to filter candidates, by default 0.5
|
||||
|
||||
Yields
|
||||
------
|
||||
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
|
||||
ObjectID and tuple of index pairs which meet the cosine
|
||||
similarity threshold
|
||||
"""
|
||||
# embeddings
|
||||
batch = cast(list[str], data_model_input.to_list())
|
||||
embds = cast(
|
||||
Tensor,
|
||||
model.encode(
|
||||
batch,
|
||||
convert_to_numpy=False,
|
||||
convert_to_tensor=True,
|
||||
show_progress_bar=False,
|
||||
),
|
||||
)
|
||||
# cosine similarity
|
||||
cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
|
||||
np.fill_diagonal(cos_sim, 0.0)
|
||||
cos_sim = np.triu(cos_sim)
|
||||
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
|
||||
|
||||
for idx_array in cos_sim_idx:
|
||||
idx_pair = cast(
|
||||
tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
|
||||
)
|
||||
yield idx_pair
|
||||
|
||||
|
||||
def similar_index_connection_graph(
|
||||
@ -15,21 +75,21 @@ def similar_index_connection_graph(
|
||||
# use this graph to get connected components (indices which belong together)
|
||||
# retain semantic connection on whole dataset
|
||||
similar_id_graph = nx.Graph()
|
||||
for (idx1, idx2) in similar_idx_pairs:
|
||||
# inplace operation, parent/child do not really exist in undirected graph
|
||||
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
|
||||
|
||||
# for idx1, idx2 in similar_idx_pairs:
|
||||
# # inplace operation, parent/child do not really exist in undirected graph
|
||||
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
|
||||
update_graph(graph=similar_id_graph, batch=similar_idx_pairs)
|
||||
|
||||
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
|
||||
|
||||
|
||||
return similar_id_graph, graph_info
|
||||
|
||||
# TODO check returning tuple
|
||||
|
||||
def similar_index_groups(
|
||||
similar_id_graph: Graph,
|
||||
) -> Iterator[tuple[PandasIndex, ...]]:
|
||||
# groups of connected indices
|
||||
ids_groups = cast(Iterator[set[PandasIndex]],
|
||||
nx.connected_components(G=similar_id_graph))
|
||||
|
||||
ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
|
||||
|
||||
for id_group in ids_groups:
|
||||
yield tuple(id_group)
|
||||
yield tuple(id_group)
|
||||
|
||||
@ -1,21 +1,17 @@
from typing import cast
from collections.abc import Iterable, Iterator
from typing import cast

import numpy as np
import numpy.typing as npt
from pandas import DataFrame, Series
from torch import Tensor
from sentence_transformers import SentenceTransformer
import sentence_transformers
import sentence_transformers.util
from tqdm.auto import tqdm # TODO: check deletion
from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
from lang_main.loggers import logger_timeline as logger
from lang_main.analysis.shared import (
candidates_by_index,
similar_index_connection_graph,
similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates


def non_relevant_obj_ids(
@ -25,35 +21,36 @@ def non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:

data = data.copy()
ids_to_ignore: set[ObjectID] = set()
obj_ids = cast(Iterable[ObjectID], # actually NumPy array
data[feature_obj_id].unique())
obj_ids = cast(
Iterable[ObjectID],  # actually NumPy array
data[feature_obj_id].unique(),
)

for obj_id in obj_ids:
feats_per_obj_id = cast(
Series,
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
)
# check for uniqueness of given feature for current ObjectID
# ignore NaN values
feats_per_obj_id = feats_per_obj_id.dropna()
unique_feats_per_obj_id = len(feats_per_obj_id.unique())

if unique_feats_per_obj_id > thresh_unique_feat_per_id:
ids_to_ignore.add(obj_id)

return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
data: DataFrame,
thresh_unique_feat_per_id: int,
*,
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> DataFrame:
logger.info("Removing non-relevant ObjectIDs from dataset")
) -> tuple[DataFrame]:
logger.info('Removing non-relevant ObjectIDs from dataset')
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
data=data,
@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
logger.info("Non-relevant ObjectIDs removed successfully")

return data
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
logger.info('Non-relevant ObjectIDs removed successfully')

return (data,)

def filter_activities_per_obj_id(
data: DataFrame,
activity_feature: str = 'VorgangsTypName',
relevant_activity_types: Iterable[str] = (
'Reparaturauftrag (Portal)',
),
feature_obj_id: str = 'ObjektID',
threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
logger.info("Filtering activities per ObjectID")
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series,
data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
# index of series contains ObjectIDs
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
filt_entries_below_thresh = (data_filter_activities[feature_obj_id]
.isin(obj_ids_below_thresh))

num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info("Activities per ObjectID filtered successfully")

return data_filter_activities, num_activities_per_obj_id

def generate_model_input(
data: DataFrame,
@ -107,8 +74,8 @@ def generate_model_input(
'VorgangsArtText',
'VorgangsBeschreibung',
),
) -> DataFrame:
logger.info("Generating concatenation of model input features")
) -> tuple[DataFrame]:
logger.info('Generating concatenation of model input features')
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@ -116,9 +83,40 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info("Model input generated successfully")

return data
logger.info('Model input generated successfully')

return (data,)


def filter_activities_per_obj_id(
data: DataFrame,
activity_feature: str = 'VorgangsTypName',
relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
feature_obj_id: str = 'ObjektID',
threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
logger.info('Filtering activities per ObjectID')
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
# index of series contains ObjectIDs
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
obj_ids_below_thresh
)

num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info('Activities per ObjectID filtered successfully')

return data_filter_activities, num_activities_per_obj_id


# for each obj_id in relevant_obj_ids
## filter data for obj_id
@ -130,6 +128,7 @@ def generate_model_input(
## obtain idx pairs, yield
## use idx pairs to get idx values of series

def get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
@ -140,92 +139,33 @@ def get_timeline_candidates_index(
model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
# already sorted ObjIDs (descending regarding number of activities)
obj_ids = cast(Iterable[ObjectID],
num_activities_per_obj_id.index)

obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

for obj_id in tqdm(obj_ids):
data_per_obj_id = cast(
DataFrame,
data.loc[data[feature_obj_id]==obj_id]
)
data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
data_model_input = data_per_obj_id[model_input_feature]

candidates_idx = candidates_by_index(
data_model_input=data_model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# directly process candidates
candidates_idx = tuple(candidates_idx)
# candidates_idx = tuple(candidates_idx)
similar_id_graph, _ = similar_index_connection_graph(
similar_idx_pairs=candidates_idx,
)

for index_group in similar_index_groups(similar_id_graph):
yield obj_id, index_group

# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset

Parameters
----------
obj_id : ObjectID
_description_
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5

Yields
------
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
ObjectID and tuple of index pairs which meet the cosine
similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair

def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
"""function to build a mapping of ObjectIDs to their respective collection of
timeline candidates (as tuple), each candidate group is separated as distinct
timeline candidates (as tuple), each candidate group is separated as distinct
tuple within this outer tuple

Parameters
@ -238,12 +178,12 @@ def transform_timeline_candidates(
dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
dictionary: ObjectID -> tuple of candidate groups
"""

candidates_by_obj_id: TimelineCandidates = {}

obj_id_target: ObjectID | None = None
collection: list[tuple[PandasIndex, ...]] = []

for obj_id, cands in candidates:
if obj_id_target is None:
collection = []
@ -253,26 +193,58 @@ def transform_timeline_candidates(
collection = []
obj_id_target = obj_id
collection.append(cands)

if collection and obj_id_target is not None:
candidates_by_obj_id[obj_id_target] = tuple(collection)

return candidates_by_obj_id

def map_obj_texts(

def map_obj_id_to_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())

obj_id_to_text: dict[ObjectID, str] = {}

for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)

for obj_id in tqdm(obj_ids):
data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text

return obj_id_to_text

return obj_id_to_text


def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')

return tl_candidates, map_obj_text

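For orientation, a hedged sketch of how the timeline helpers above chain together; it mirrors the pipe_timeline wiring further down. The variable names are assumptions, and `data` is assumed to be the preprocessed DataFrame that already carries the concatenated 'nlp_model_input' column from generate_model_input.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# keep only ObjectIDs with more than one relevant activity
data_filt, num_acts = filter_activities_per_obj_id(data, threshold_num_activities=1)
tl_candidates, obj_texts = get_timeline_candidates(
    data_filt,
    num_acts,
    model=model,
    cos_sim_threshold=0.8,
)
# tl_candidates: ObjectID -> tuple of index groups; obj_texts: ObjectID -> 'HObjektText' descriptor
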
@ -1,56 +1,56 @@
|
||||
from typing import cast
|
||||
import re
|
||||
from itertools import combinations
|
||||
from collections.abc import Iterator
|
||||
from itertools import combinations
|
||||
from typing import cast
|
||||
|
||||
from dateutil.parser import parse
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from pandas import DataFrame
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
from lang_main.analysis.graphs import (
|
||||
update_graph,
|
||||
TokenGraph,
|
||||
update_graph,
|
||||
)
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
|
||||
# ** Logging
|
||||
#LOGGING_LEVEL = 'INFO'
|
||||
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
#logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
# LOGGING_LEVEL = 'INFO'
|
||||
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
|
||||
# logger = logging.getLogger('ihm_analyse.token_analysis')
|
||||
|
||||
# ** POS
|
||||
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
|
||||
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
|
||||
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
|
||||
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
|
||||
|
||||
#POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
|
||||
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
|
||||
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
|
||||
|
||||
# ** TAG
|
||||
#TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
|
||||
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
|
||||
TAG_OF_INTEREST: frozenset[str] = frozenset()
|
||||
|
||||
|
||||
# ** obtaining connection in texts
|
||||
|
||||
|
||||
def pre_clean_word(string: str) -> str:
|
||||
|
||||
pattern = r'[^A-Za-zäöüÄÖÜ]+'
|
||||
string = re.sub(pattern, '', string)
|
||||
|
||||
|
||||
return string
|
||||
|
||||
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
|
||||
|
||||
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
|
||||
def is_str_date(
|
||||
string: str,
|
||||
string: str,
|
||||
fuzzy: bool = False,
|
||||
) -> bool:
|
||||
#print(string)
|
||||
# print(string)
|
||||
try:
|
||||
# check if string is a number
|
||||
# if length is greater than 8, it is not a date
|
||||
@ -60,33 +60,38 @@ def is_str_date(
|
||||
except ValueError:
|
||||
# not a number
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
parse(string, fuzzy=fuzzy)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def obtain_relevant_descendants(
|
||||
token: SpacyToken,
|
||||
token: SpacyToken,
|
||||
) -> Iterator[SpacyToken]:
|
||||
|
||||
for descendant in token.subtree:
|
||||
# subtrees contain the token itself
|
||||
# if current element is token skip this element
|
||||
if descendant == token:
|
||||
continue
|
||||
|
||||
|
||||
# if descendant is a date skip it)
|
||||
if is_str_date(string=descendant.text):
|
||||
continue
|
||||
|
||||
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
|
||||
f">>{descendant}<<, POS >>{descendant.pos_}<<"))
|
||||
|
||||
|
||||
logger.debug(
|
||||
(
|
||||
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
|
||||
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
|
||||
)
|
||||
)
|
||||
|
||||
# eliminate cases of cross-references with verbs
|
||||
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and
|
||||
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
|
||||
if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
|
||||
descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
|
||||
):
|
||||
continue
|
||||
# skip cases in which descendant is indirect POS with others than verbs
|
||||
elif descendant.pos_ in POS_INDIRECT:
|
||||
@ -94,11 +99,12 @@ def obtain_relevant_descendants(
|
||||
# skip cases in which child has no relevant POS or TAG
|
||||
elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
|
||||
continue
|
||||
|
||||
|
||||
yield descendant
|
||||
|
||||
|
||||
# TODO look at results and fine-tune function accordingly
|
||||
|
||||
|
||||
def add_doc_info_to_graph(
|
||||
graph: TokenGraph,
|
||||
doc: SpacyDoc,
|
||||
@ -114,7 +120,7 @@ def add_doc_info_to_graph(
|
||||
# skip token which are dates or times
|
||||
if is_str_date(string=token.text):
|
||||
continue
|
||||
|
||||
|
||||
relevant_descendants = obtain_relevant_descendants(token=token)
|
||||
# for non-AUX: add parent <--> descendant pair to graph
|
||||
if token.pos_ not in POS_INDIRECT:
|
||||
@ -124,13 +130,13 @@ def add_doc_info_to_graph(
|
||||
graph=graph,
|
||||
parent=token.lemma_,
|
||||
child=descendant.lemma_,
|
||||
weight_connection=weight
|
||||
weight_connection=weight,
|
||||
)
|
||||
else:
|
||||
# if indirect POS, make connection between all associated words
|
||||
combs = combinations(relevant_descendants, r=2)
|
||||
for comb in combs:
|
||||
# !! parents and children do not really exist in this case,
|
||||
# !! parents and children do not really exist in this case,
|
||||
# !! but only one connection is made
|
||||
update_graph(
|
||||
graph=graph,
|
||||
@ -139,32 +145,33 @@ def add_doc_info_to_graph(
|
||||
weight_connection=weight,
|
||||
)
|
||||
|
||||
|
||||
def build_token_graph(
|
||||
data: DataFrame,
|
||||
model: GermanSpacyModel,
|
||||
) -> tuple[TokenGraph]:
|
||||
# empty NetworkX directed graph
|
||||
#graph = nx.DiGraph()
|
||||
# graph = nx.DiGraph()
|
||||
graph = TokenGraph()
|
||||
|
||||
|
||||
for row in tqdm(data.itertuples(), total=len(data)):
|
||||
# obtain properties from tuple
|
||||
# attribute names must match with preprocessed data
|
||||
entry_text = cast(str, row.entry)
|
||||
weight = cast(int, row.num_occur)
|
||||
|
||||
|
||||
# get spacy model output
|
||||
doc = model(entry_text)
|
||||
|
||||
|
||||
add_doc_info_to_graph(
|
||||
graph=graph,
|
||||
doc=doc,
|
||||
weight=weight,
|
||||
)
|
||||
|
||||
|
||||
# metadata
|
||||
graph.update_metadata()
|
||||
# convert to undirected
|
||||
graph.to_undirected()
|
||||
|
||||
return (graph,)
|
||||
|
||||
return (graph,)
|
||||
|
||||
55
src/lang_main/constants.py
Normal file
@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final

from lang_main import CONFIG

# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_acitivities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]
56
src/lang_main/lang_main_config.toml
Normal file
@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
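A minimal sketch of how this config file feeds the new constants module, assuming the TOML is read with the load_toml_config helper shown further down in lang_main.shared; the explicit file path here is an assumption for illustration (in the package, CONFIG is resolved inside lang_main itself).

import tomllib
from pathlib import Path
from typing import Any, Final


def load_toml_config(path_to_toml: str | Path) -> dict[str, Any]:
    # same shape as the helper in lang_main.shared
    with open(path_to_toml, 'rb') as f:
        return tomllib.load(f)


CONFIG = load_toml_config('src/lang_main/lang_main_config.toml')  # path assumed for this sketch
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']  # -> 0.8
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']  # -> False
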
@ -1,5 +1,5 @@
from typing import Final
import logging
from typing import Final

from lang_main.types import LoggingLevels


@ -1,20 +1,18 @@
|
||||
from typing import Any
|
||||
#from types import FunctionType
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_pipelines as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# ** pipelines to perform given actions on dataset in a customisable manner
|
||||
|
||||
|
||||
class NoPerformableActionError(Exception):
|
||||
"""Error describing that no action is available in the current pipeline"""
|
||||
|
||||
class BasePipeline():
|
||||
|
||||
|
||||
class BasePipeline:
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
@ -22,12 +20,14 @@ class BasePipeline():
|
||||
) -> None:
|
||||
# init base class
|
||||
super().__init__()
|
||||
|
||||
|
||||
# name of pipeline
|
||||
self.name = name
|
||||
# working directory for pipeline == output path
|
||||
self.working_dir = working_dir
|
||||
|
||||
# if not self.working_dir.exists():
|
||||
# self.working_dir.mkdir(parents=True)
|
||||
|
||||
# container for actions to perform during pass
|
||||
self.actions: list[Callable] = []
|
||||
self.action_names: list[str] = []
|
||||
@ -37,15 +37,17 @@ class BasePipeline():
|
||||
self.curr_proc_idx: int = 1
|
||||
# intermediate result
|
||||
self._intermediate_result: Any | None = None
|
||||
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return (f"{self.__class__.__name__}(name: {self.name}, "
|
||||
f"working dir: {self.working_dir}, contents: {self.action_names})")
|
||||
|
||||
return (
|
||||
f'{self.__class__.__name__}(name: {self.name}, '
|
||||
f'working dir: {self.working_dir}, contents: {self.action_names})'
|
||||
)
|
||||
|
||||
@property
|
||||
def intermediate_result(self) -> Any:
|
||||
return self._intermediate_result
|
||||
|
||||
|
||||
def add(
|
||||
self,
|
||||
action: Callable,
|
||||
@ -53,16 +55,17 @@ class BasePipeline():
|
||||
save_result: bool = False,
|
||||
) -> None:
|
||||
# check explicitly for function type
|
||||
#if isinstance(action, FunctionType):
|
||||
# if isinstance(action, FunctionType):
|
||||
if isinstance(action, Callable):
|
||||
self.actions.append(action)
|
||||
self.action_names.append(action.__name__)
|
||||
self.actions_kwargs.append(action_kwargs.copy())
|
||||
self.is_save_result.append(save_result)
|
||||
else:
|
||||
raise TypeError(("Action must be custom function, "
|
||||
f"but is of type >>{type(action)}<<."))
|
||||
|
||||
raise TypeError(
|
||||
f'Action must be custom function, but is of type >>{type(action)}<<.'
|
||||
)
|
||||
|
||||
# TODO: add multiple entries by utilising simple add method
|
||||
"""
|
||||
def add_multi(
|
||||
@ -84,7 +87,7 @@ class BasePipeline():
|
||||
raise TypeError(("Action must be function or sequence of functions, "
|
||||
f"but is of type >>{type(action)}<<."))
|
||||
"""
|
||||
|
||||
|
||||
def save_curr_result(
|
||||
self,
|
||||
filename: str,
|
||||
@ -94,7 +97,7 @@ class BasePipeline():
|
||||
target_path = target_path.with_suffix('.pkl')
|
||||
# saving file locally
|
||||
save_pickle(obj=self._intermediate_result, path=target_path)
|
||||
|
||||
|
||||
def load_intermediate_result(
|
||||
self,
|
||||
saving_path: str,
|
||||
@ -103,25 +106,26 @@ class BasePipeline():
|
||||
target_path = Path(saving_path + filename).with_suffix('.pkl')
|
||||
# loading DataFrame or Series from pickle
|
||||
data = load_pickle(target_path)
|
||||
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def prep_run(self) -> None:
|
||||
logger.info(f"Starting processing pipeline >>{self.name}<<...")
|
||||
logger.info(f'Starting processing pipeline >>{self.name}<<...')
|
||||
# progress tracking
|
||||
self.curr_proc_idx = 1
|
||||
# check if performable actions available
|
||||
if len(self.actions) == 0:
|
||||
raise NoPerformableActionError(("The pipeline does not contain any "
|
||||
"performable actions."))
|
||||
|
||||
raise NoPerformableActionError(
|
||||
('The pipeline does not contain any ' 'performable actions.')
|
||||
)
|
||||
|
||||
def run(
|
||||
self,
|
||||
starting_values: tuple[Any, ...],
|
||||
) -> tuple[Any, ...]:
|
||||
# prepare start
|
||||
self.prep_run()
|
||||
|
||||
|
||||
for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
|
||||
if idx == 0:
|
||||
ret = action(*starting_values, **action_kwargs)
|
||||
@ -134,7 +138,7 @@ class BasePipeline():
|
||||
self.save_curr_result(filename=self.action_names[idx])
|
||||
# processing tracking
|
||||
self.curr_proc_idx += 1
|
||||
|
||||
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.")
|
||||
|
||||
return ret
|
||||
|
||||
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
|
||||
|
||||
return ret
|
||||
|
||||
@ -1,57 +1,144 @@
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import spacy
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from lang_main import (
|
||||
SAVE_PATH_FOLDER,
|
||||
DATE_COLS,
|
||||
FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.preprocessing import (
|
||||
load_raw_data,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
analyse_feature,
|
||||
clean_string_slim,
|
||||
entry_wise_cleansing,
|
||||
analyse_feature,
|
||||
build_cosSim_matrix,
|
||||
filt_thresh_cosSim_matrix,
|
||||
list_cosSim_dupl_candidates,
|
||||
load_raw_data,
|
||||
merge_similarity_dupl,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
)
|
||||
from lang_main.analysis.timeline import (
|
||||
filter_activities_per_obj_id,
|
||||
generate_model_input,
|
||||
get_timeline_candidates,
|
||||
remove_non_relevant_obj_ids,
|
||||
)
|
||||
from lang_main.analysis.tokens import build_token_graph
|
||||
from lang_main.constants import (
|
||||
ACTIVITY_FEATURE,
|
||||
ACTIVITY_TYPES,
|
||||
DATE_COLS,
|
||||
FEATURE_NAME_OBJ_ID,
|
||||
MODEL_INPUT_FEATURES,
|
||||
SAVE_PATH_FOLDER,
|
||||
THRESHOLD_NUM_ACTIVITIES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
THRESHOLD_TIMELINE_SIMILARITY,
|
||||
THRESHOLD_UNIQUE_TEXTS,
|
||||
UNIQUE_CRITERION_FEATURE,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
|
||||
# ** pipeline configuration
|
||||
# ** target feature preparation
|
||||
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
|
||||
pipe_target_feat.add(
|
||||
load_raw_data,
|
||||
{
|
||||
'date_cols': DATE_COLS,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(remove_duplicates)
|
||||
pipe_target_feat.add(remove_NA, save_result=True)
|
||||
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})
|
||||
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)
|
||||
pipe_target_feat.add(
|
||||
entry_wise_cleansing,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
'cleansing_func': clean_string_slim,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(
|
||||
analyse_feature,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
# output: DataFrame containing target feature with
|
||||
# number of occurrences and associated ObjectIDs
|
||||
|
||||
# ** embedding pipe
|
||||
# ?? still needed?
|
||||
# using similarity between entries to catch duplicates with typo or similar content
|
||||
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
model_spacy = spacy.load('de_dep_news_trf')
|
||||
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
|
||||
|
||||
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True)
|
||||
pipe_embds.add(
|
||||
list_cosSim_dupl_candidates,
|
||||
{'save_candidates': True,
|
||||
'saving_path': SAVE_PATH_FOLDER,
|
||||
'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
'pipeline': pipe_embds}, save_result=True)
|
||||
# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
# pipe_embds.add(
|
||||
# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
|
||||
# )
|
||||
# pipe_embds.add(
|
||||
# list_cosSim_dupl_candidates,
|
||||
# {
|
||||
# 'save_candidates': True,
|
||||
# 'saving_path': SAVE_PATH_FOLDER,
|
||||
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
# 'pipeline': pipe_embds,
|
||||
# },
|
||||
# save_result=True,
|
||||
# )
|
||||
|
||||
# ** Merge duplicates
|
||||
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
# pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
pipe_merge.add(
|
||||
merge_similarity_dupl,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_SIMILARITY,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
# ** token analysis
|
||||
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)
|
||||
pipe_token_analysis.add(
|
||||
build_token_graph,
|
||||
{
|
||||
'model': model_spacy,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
|
||||
# ** timeline analysis
|
||||
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_timeline.add(
|
||||
remove_non_relevant_obj_ids,
|
||||
{
|
||||
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
|
||||
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
pipe_timeline.add(
|
||||
generate_model_input,
|
||||
{
|
||||
'target_feature_name': 'nlp_model_input',
|
||||
'model_input_features': MODEL_INPUT_FEATURES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
filter_activities_per_obj_id,
|
||||
{
|
||||
'activity_feature': ACTIVITY_FEATURE,
|
||||
'relevant_activity_types': ACTIVITY_TYPES,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
get_timeline_candidates,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'model_input_feature': 'nlp_model_input',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
@ -1,56 +1,67 @@
|
||||
from typing import Any
|
||||
import os
|
||||
import shutil
|
||||
import pickle
|
||||
import shutil
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_shared_helpers as logger
|
||||
|
||||
|
||||
# ** Lib
|
||||
def create_saving_folder(
|
||||
saving_path_folder: str | Path,
|
||||
overwrite_existing: bool = False,
|
||||
) -> None:
|
||||
# check for existence of given path
|
||||
if not os.path.exists(saving_path_folder):
|
||||
os.makedirs(saving_path_folder)
|
||||
if isinstance(saving_path_folder, str):
|
||||
saving_path_folder = Path(saving_path_folder)
|
||||
if not saving_path_folder.exists():
|
||||
saving_path_folder.mkdir(parents=True)
|
||||
else:
|
||||
if overwrite_existing:
|
||||
# overwrite if desired (deletes whole path and re-creates it)
|
||||
shutil.rmtree(saving_path_folder)
|
||||
os.makedirs(saving_path_folder)
|
||||
else:
|
||||
logger.info((f"Path >>{saving_path_folder}<< already exists and remained "
|
||||
"unchanged. If you want to overwrite this path, use parameter "
|
||||
">>overwrite_existing<<."))
|
||||
logger.info(
|
||||
(
|
||||
f'Path >>{saving_path_folder}<< already exists and remained '
|
||||
f'unchanged. If you want to overwrite this path, use parameter '
|
||||
f'>>overwrite_existing<<.'
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def load_toml_config(
|
||||
path_to_toml: str | Path,
|
||||
) -> dict[str, Any]:
|
||||
with open(path_to_toml, "rb") as f:
|
||||
with open(path_to_toml, 'rb') as f:
|
||||
data = tomllib.load(f)
|
||||
logger.info("Loaded TOML config file successfully.")
|
||||
logger.info('Loaded TOML config file successfully.')
|
||||
return data
|
||||
|
||||
|
||||
# saving and loading using pickle
|
||||
# careful: pickling from unknown sources can be dangerous
|
||||
def save_pickle(
|
||||
obj: Any,
|
||||
obj: Any,
|
||||
path: str | Path,
|
||||
) -> None:
|
||||
with open(path, 'wb') as file:
|
||||
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
logger.info(f"Saved file successfully under {path}")
|
||||
logger.info(f'Saved file successfully under {path}')
|
||||
|
||||
|
||||
def load_pickle(
|
||||
path: str | Path,
|
||||
) -> Any:
|
||||
with open(path, 'rb') as file:
|
||||
obj = pickle.load(file)
|
||||
logger.info("Loaded file successfully.")
|
||||
logger.info('Loaded file successfully.')
|
||||
return obj
|
||||
|
||||
|
||||
# TODO: remove, too specialised for common application
|
||||
"""
|
||||
def filter_candidates_idx(
|
||||
@ -103,4 +114,4 @@ def filter_candidates_idx(
|
||||
tuple(data_model_input.index[idx] for idx in idx_array)
|
||||
)
|
||||
yield idx_pair
|
||||
"""
|
||||
"""
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import TypeAlias, Literal
|
||||
from typing import Literal, TypeAlias
|
||||
|
||||
import numpy as np
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
@ -6,7 +6,7 @@ from torch import Tensor
|
||||
|
||||
LoggingLevels: TypeAlias = Literal[
|
||||
'DEBUG',
|
||||
'INFO',
|
||||
'INFO',
|
||||
'WARNING',
|
||||
'ERROR',
|
||||
'CRITICAL',
|
||||
@ -16,4 +16,4 @@ PandasIndex: TypeAlias = int | np.int64
|
||||
ObjectID: TypeAlias = int
|
||||
Embedding: TypeAlias = SpacyDoc | Tensor
|
||||
|
||||
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
|
||||
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
|
||||
|
||||
@ -13,29 +13,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'ihm_analyse'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from ihm_analyse import CONFIG\n",
|
||||
"from ihm_analyse.lib.preprocess import (\n",
|
||||
"from lang_main import CONFIG\n",
|
||||
"from lang_main.lib.preprocess import (\n",
|
||||
" load_raw_data,\n",
|
||||
" remove_duplicates,\n",
|
||||
" remove_NA,\n",
|
||||
@ -47,8 +43,8 @@
|
||||
" list_cosSim_dupl_candidates,\n",
|
||||
" merge_similarity_dupl,\n",
|
||||
")\n",
|
||||
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from ihm_analyse.lib.helpers import (\n",
|
||||
"from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from lang_main.lib.helpers import (\n",
|
||||
" save_pickle, \n",
|
||||
" load_pickle, \n",
|
||||
" create_saving_folder,\n",
|
||||
|
||||
BIN
test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl
Normal file
Binary file not shown.
@ -1,28 +1,42 @@
|
||||
from typing import cast
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from dash import (
|
||||
Dash,
|
||||
html,
|
||||
dcc,
|
||||
callback,
|
||||
Output,
|
||||
Input,
|
||||
Output,
|
||||
State,
|
||||
callback,
|
||||
dash_table,
|
||||
dcc,
|
||||
html,
|
||||
)
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
from lang_main import load_pickle
|
||||
from lang_main.types import ObjectID, TimelineCandidates
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main import load_pickle
|
||||
from lang_main.types import TimelineCandidates, ObjectID
|
||||
|
||||
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
|
||||
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
|
||||
|
||||
# ** data
|
||||
data = cast(DataFrame, load_pickle('./data.pkl'))
|
||||
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
|
||||
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
|
||||
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
|
||||
p_tl = Path(
|
||||
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
|
||||
)
|
||||
ret = cast(DataFrame, load_pickle(p_df))
|
||||
data = ret[0]
|
||||
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
|
||||
cands = ret[0]
|
||||
texts = ret[1]
|
||||
|
||||
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
|
||||
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
|
||||
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
|
||||
# data = cast(DataFrame, load_pickle(p_df))
|
||||
# cands = cast(TimelineCandidates, load_pickle(p_cands))
|
||||
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
|
||||
|
||||
table_feats = [
|
||||
'ErstellungsDatum',
|
||||
'ErledigungsDatum',
|
||||
@ -52,25 +66,28 @@ hover_data = {
|
||||
app = Dash(prevent_initial_callbacks=True)
|
||||
|
||||
app.layout = [
|
||||
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
|
||||
html.Div(children=[
|
||||
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
|
||||
dcc.Dropdown(
|
||||
list(cands.keys()),
|
||||
id='dropdown-selection',
|
||||
placeholder="ObjektID auswählen...",
|
||||
)
|
||||
]),
|
||||
html.Div(children=[
|
||||
html.H3(id='object_text'),
|
||||
dcc.Dropdown(id='choice-candidates'),
|
||||
dcc.Graph(id='graph-output'),
|
||||
]),
|
||||
html.Div(children=[
|
||||
dash_table.DataTable(id='table-candidates')
|
||||
]),
|
||||
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
|
||||
html.Div(
|
||||
children=[
|
||||
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
|
||||
dcc.Dropdown(
|
||||
list(cands.keys()),
|
||||
id='dropdown-selection',
|
||||
placeholder='ObjektID auswählen...',
|
||||
),
|
||||
]
|
||||
),
|
||||
html.Div(
|
||||
children=[
|
||||
html.H3(id='object_text'),
|
||||
dcc.Dropdown(id='choice-candidates'),
|
||||
dcc.Graph(id='graph-output'),
|
||||
]
|
||||
),
|
||||
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
|
||||
]
|
||||
|
||||
|
||||
@callback(
|
||||
Output('object_text', 'children'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
|
||||
headline = f'HObjektText: {obj_text}'
|
||||
return headline
|
||||
|
||||
|
||||
@callback(
|
||||
Output('choice-candidates', 'options'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
@ -90,9 +108,10 @@ def update_obj_text(obj_id):
|
||||
def update_choice_candidates(obj_id):
|
||||
obj_id = int(obj_id)
|
||||
cands_obj_id = cands[obj_id]
|
||||
choices = list(range(1, len(cands_obj_id)+1))
|
||||
choices = list(range(1, len(cands_obj_id) + 1))
|
||||
return choices
|
||||
|
||||
|
||||
@callback(
|
||||
Output('graph-output', 'figure'),
|
||||
Input('choice-candidates', 'value'),
|
||||
@ -106,7 +125,7 @@ def update_timeline(index, obj_id):
|
||||
title = f'HObjektText: {obj_text}'
|
||||
# cands
|
||||
cands_obj_id = cands[obj_id]
|
||||
cands_choice = cands_obj_id[int(index)-1]
|
||||
cands_choice = cands_obj_id[int(index) - 1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
# figure
|
||||
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
|
||||
title=title,
|
||||
hover_data=hover_data,
|
||||
)
|
||||
fig.update_traces(
|
||||
mode='markers+lines',
|
||||
marker=markers,
|
||||
marker_symbol='diamond'
|
||||
)
|
||||
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
|
||||
fig.update_xaxes(
|
||||
tickformat="%B\n%Y",
|
||||
tickformat='%B\n%Y',
|
||||
rangeslider_visible=True,
|
||||
)
|
||||
fig.update_yaxes(type='category')
|
||||
fig.update_layout(hovermode="x unified")
|
||||
fig.update_layout(hovermode='x unified')
|
||||
return fig
|
||||
|
||||
|
||||
@callback(
|
||||
[Output('table-candidates', 'data'),
|
||||
Output('table-candidates', 'columns')],
|
||||
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
|
||||
Input('choice-candidates', 'value'),
|
||||
State('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
@ -141,19 +156,20 @@ def update_table_candidates(index, obj_id):
|
||||
obj_id = int(obj_id)
|
||||
# cands
|
||||
cands_obj_id = cands[obj_id]
|
||||
cands_choice = cands_obj_id[int(index)-1]
|
||||
cands_choice = cands_obj_id[int(index) - 1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
df = (df
|
||||
.filter(items=table_feats, axis=1)
|
||||
.sort_values(by='ErstellungsDatum', ascending=True))
|
||||
cols = [{"name": i, "id": i} for i in df.columns]
|
||||
df = df.filter(items=table_feats, axis=1).sort_values(
|
||||
by='ErstellungsDatum', ascending=True
|
||||
)
|
||||
cols = [{'name': i, 'id': i} for i in df.columns]
|
||||
# convert dates to strings
|
||||
for col in table_feats_dates:
|
||||
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
|
||||
|
||||
|
||||
table_data = df.to_dict('records')
|
||||
return table_data, cols
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True)
|
||||
app.run(debug=True)
|
||||
|
||||
56
test-notebooks/dashboard/lang_main_config.toml
Normal file
@ -0,0 +1,56 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = './inputs/'
|
||||
results = './results/test_new2/'
|
||||
dataset = './01_2_Rohdaten_neu/Export4.csv'
|
||||
#results = './results/Export7/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
|
||||
#results = './results/Export7_trunc/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
|
||||
|
||||
[control]
|
||||
preprocessing = true
|
||||
preprocessing_skip = false
|
||||
token_analysis = false
|
||||
token_analysis_skip = false
|
||||
graph_postprocessing = false
|
||||
graph_postprocessing_skip = false
|
||||
time_analysis = false
|
||||
time_analysis_skip = false
|
||||
|
||||
#[export_filenames]
|
||||
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
|
||||
[preprocess]
|
||||
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis.uniqueness]
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
|
||||
[time_analysis.model_input]
|
||||
input_features = [
|
||||
'VorgangsTypName',
|
||||
'VorgangsArtText',
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
activity_feature = 'VorgangsTypName'
|
||||
activity_types = [
|
||||
'Reparaturauftrag (Portal)',
|
||||
'Störungsmeldung',
|
||||
]
|
||||
threshold_num_acitivities = 1
|
||||
threshold_similarity = 0.8
|
||||
Binary file not shown.
663
test-notebooks/display_results.ipynb
Normal file
@ -0,0 +1,663 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from lang_main import load_pickle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_files(path: str) -> tuple[Path, ...]:\n",
|
||||
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
" assert p.exists(), \"path does not exist\"\n",
|
||||
" return tuple(p.glob(r'*'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
|
||||
]
|
||||
},
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file = files[-1]\n",
|
||||
"ret = load_pickle(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 89,
|
||||
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2559</th>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2558</th>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2557</th>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2556</th>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6782</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry ... num_assoc_obj_ids\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
"... ... ... ...\n",
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
"\n",
"[4545 rows x 5 columns]"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\Arbeitsaufgaben\lang-main\scripts\results\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
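{
"cell_type": "markdown",
"id": "note-load-steps",
"metadata": {},
"source": [
"A possible follow-up, purely illustrative: reuse `files` and `load_pickle` from the cells above to pull in every pickled step result at once, keyed by file stem."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-load-steps",
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch: load each pickled pipeline step, keyed by its file stem\n",
"step_results = {f.stem: load_pickle(f) for f in files}\n",
"list(step_results)"
]
},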
{
"cell_type": "code",
"execution_count": 71,
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>162</td>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>33</td>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>131</td>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160</td>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>140</td>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>2559</td>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6757</th>\n",
" <td>2558</td>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6758</th>\n",
" <td>2557</td>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6759</th>\n",
" <td>2556</td>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6760</th>\n",
" <td>6782</td>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" index ... num_assoc_obj_ids\n",
"0 162 ... 206\n",
"1 33 ... 74\n",
"2 131 ... 4\n",
"3 160 ... 11\n",
"4 140 ... 166\n",
"... ... ... ...\n",
"6756 2559 ... 1\n",
"6757 2558 ... 1\n",
"6758 2557 ... 1\n",
"6759 2556 ... 1\n",
"6760 6782 ... 2\n",
"\n",
"[4545 rows x 6 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
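{
"cell_type": "markdown",
"id": "note-merged-check",
"metadata": {},
"source": [
"A minimal, illustrative check on the merged frame shown above; the column name `num_occur` and the row count 4545 come from that output, the rest is a sketch."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-merged-check",
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch: sanity-check the merged frame from above\n",
"merged = ret[0]\n",
"assert len(merged) == 4545  # row count shown in the output above\n",
"merged.sort_values('num_occur', ascending=False).head(10)"
]
},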
{
"cell_type": "code",
"execution_count": null,
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 72,
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(i for i in range(100000000))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num = 0\n",
"for i in tqdm(tup):\n",
" num += i"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4999999950000000"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num"
]
},
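{
"cell_type": "markdown",
"id": "note-closed-form",
"metadata": {},
"source": [
"The loop above just sums the integers 0..99,999,999; the closed form n*(n-1)/2 gives the same value without iterating, sketched below as a quick cross-check."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-closed-form",
"metadata": {},
"outputs": [],
"source": [
"# illustrative cross-check: sum of 0..n-1 equals n*(n-1)//2\n",
"n = 100_000_000\n",
"assert n * (n - 1) // 2 == num == 4_999_999_950_000_000\n",
"n * (n - 1) // 2"
]
},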
{
"cell_type": "code",
"execution_count": null,
"id": "36366147-3632-4518-936e-878563305e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
"metadata": {},
"outputs": [],
"source": [
"arr = np.random.rand(1000,1000)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
"metadata": {},
"outputs": [],
"source": [
"idx = np.argwhere(arr >= 0.97)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30220"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx)"
]
},
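{
"cell_type": "markdown",
"id": "note-upper-triangle",
"metadata": {},
"source": [
"`np.argwhere(arr >= 0.97)` counts each (i, j)/(j, i) pair separately once the matrix is symmetric (e.g. a similarity matrix); the sketch below keeps only the upper triangle so every pair appears once. Here `arr` is just random data, so the cell is illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-upper-triangle",
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch: keep only i < j index pairs of a (symmetric) matrix\n",
"iu, ju = np.triu_indices_from(arr, k=1)\n",
"mask = arr[iu, ju] >= 0.97\n",
"unique_pairs = np.column_stack((iu[mask], ju[mask]))\n",
"len(unique_pairs)"
]
},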
{
"cell_type": "code",
"execution_count": 66,
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
"metadata": {},
"outputs": [],
"source": [
"batch = idx[:200]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(map(tuple, batch))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(tuple(x) for x in batch)"
]
},
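{
"cell_type": "markdown",
"id": "note-tolist-timing",
"metadata": {},
"source": [
"Both timings above build the nested tuples element by element in Python; `ndarray.tolist()` does the bulk conversion in C first and is usually quicker for this kind of hand-off, sketched below for comparison."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-tolist-timing",
"metadata": {},
"outputs": [],
"source": [
"%%timeit\n",
"# illustrative variant: convert via tolist() before building the tuples\n",
"tuple(map(tuple, batch.tolist()))"
]
},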
{
"cell_type": "code",
"execution_count": null,
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}