STRF for similarity duplicates, time analysis pipeline, enhanced config
This commit is contained in:
parent 5d2c97165a
commit bb987e2108
@@ -34,3 +34,15 @@ trials = [
    "plotly>=5.22.0",
    "dash>=2.17.0",
]

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]
@@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast

from pandas import DataFrame, Series

from ihm_analyse import (
    SAVE_PATH_FOLDER,
    PATH_TO_DATASET,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
    DO_PREPROCESSING,
    DO_TOKEN_ANALYSIS,
    DO_GRAPH_POSTPROCESSING,
from lang_main import (
    TokenGraph,
    create_saving_folder,
    load_pickle,
    Embedding,
    Index,
    TokenGraph,
)
from ihm_analyse.predefined_pipes import (
    pipe_target_feat,
    pipe_embds,
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series


# ** processing pipeline
def run_preprocessing() -> DataFrame:
@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
|
||||
overwrite_existing=True,
|
||||
)
|
||||
# run pipelines
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
|
||||
)
|
||||
target_feat_data = ret[0]
|
||||
# only entries with more than threshold amount of characters
|
||||
data_filter = typing.cast(Series,
|
||||
(target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
dupl_idx_pairs, embds = typing.cast(
|
||||
tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
|
||||
pipe_embds.run(starting_values=(subset_data,))
|
||||
)
|
||||
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
# dupl_idx_pairs, embds = typing.cast(
|
||||
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
|
||||
# pipe_embds.run(starting_values=(subset_data,)),
|
||||
# )
|
||||
# merge duplicates, results saved separately
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
|
||||
subset_data = target_feat_data.loc[data_filter].copy()
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame],
|
||||
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
|
||||
pipe_merge.run(starting_values=(subset_data,)),
|
||||
)
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
return preprocessed_data
|
||||
|
||||
|
||||
def run_token_analysis(
|
||||
preprocessed_data: DataFrame,
|
||||
) -> TokenGraph:
|
||||
# build token graph
|
||||
(tk_graph,) = typing.cast(tuple[TokenGraph],
|
||||
pipe_token_analysis.run(starting_values=(preprocessed_data,)))
|
||||
(tk_graph,) = typing.cast(
|
||||
tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
|
||||
)
|
||||
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
|
||||
tk_graph.to_pickle(SAVE_PATH_FOLDER,
|
||||
filename=f'{pipe_token_analysis.name}-TokenGraph')
|
||||
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
|
||||
|
||||
return tk_graph
|
||||
|
||||
|
||||
def run_graph_postprocessing(
|
||||
tk_graph: TokenGraph,
|
||||
) -> TokenGraph:
|
||||
# filter graph by edge weight and remove single nodes (no connection)
|
||||
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
|
||||
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
|
||||
tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
|
||||
filename='TokenGraph-filtered',
|
||||
directed=False)
|
||||
tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
|
||||
filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')
|
||||
tk_graph_filtered.save_graph(
|
||||
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
|
||||
)
|
||||
tk_graph_filtered.to_pickle(
|
||||
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
|
||||
)
|
||||
|
||||
return tk_graph_filtered
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
|
||||
filename = 'without_nan'
|
||||
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
ret = load_pickle(loading_path)
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
ret = cast(
|
||||
tuple[TimelineCandidates, dict[ObjectID, str]],
|
||||
pipe_timeline.run(starting_values=(preprocessed_data,)),
|
||||
)
|
||||
return ret
|
||||
|
||||
|
||||
def verify_path(
|
||||
loading_path: Path,
|
||||
) -> None:
|
||||
if not loading_path.exists():
|
||||
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
|
||||
|
||||
|
||||
def main() -> None:
|
||||
pre_step_skipped: bool = False
|
||||
# ** preprocess
|
||||
if DO_PREPROCESSING:
|
||||
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
|
||||
preprocessed_data = run_preprocessing()
|
||||
else:
|
||||
elif not SKIP_PREPROCESSING:
|
||||
# !! hardcoded result filenames
|
||||
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
|
||||
target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
|
||||
ret = typing.cast(tuple[DataFrame],
|
||||
load_pickle(target_filepath))
|
||||
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
|
||||
verify_path(loading_path)
|
||||
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
|
||||
preprocessed_data = ret[0]
|
||||
# ** token analysis
|
||||
if DO_TOKEN_ANALYSIS:
|
||||
preprocessed_data_trunc = typing.cast(DataFrame,
|
||||
preprocessed_data[['entry', 'num_occur']].copy()) # type: ignore
|
||||
tk_graph = run_token_analysis(preprocessed_data_trunc)
|
||||
else:
|
||||
pre_step_skipped = True
|
||||
warnings.warn('No preprocessing action selected. Skipped.')
|
||||
# sys.exit(0)
|
||||
# ** token analysis
|
||||
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
|
||||
if pre_step_skipped:
|
||||
raise RuntimeError(
|
||||
'Preprocessing step skipped. Token analysis cannot be performed.'
|
||||
)
|
||||
preprocessed_data_trunc = typing.cast(
|
||||
DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
|
||||
) # type: ignore
|
||||
tk_graph = run_token_analysis(preprocessed_data_trunc)
|
||||
elif not SKIP_TOKEN_ANALYSIS:
|
||||
# !! hardcoded result filenames
|
||||
# whole graph
|
||||
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
|
||||
tk_graph = TokenGraph.from_pickle(loading_path)
|
||||
# ** graph postprocessing
|
||||
if DO_GRAPH_POSTPROCESSING:
|
||||
tk_graph_filtered = run_graph_postprocessing(tk_graph)
|
||||
pre_step_skipped = False
|
||||
else:
|
||||
pre_step_skipped = True
|
||||
warnings.warn('No token analysis action selected. Skipped.')
|
||||
# ** graph postprocessing
|
||||
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
|
||||
if pre_step_skipped:
|
||||
raise RuntimeError(
|
||||
(
|
||||
'Preprocessing or token analysis step skipped. '
|
||||
'Graph postprocessing cannot be performed.'
|
||||
)
|
||||
)
|
||||
tk_graph_filtered = run_graph_postprocessing(tk_graph)
|
||||
elif not SKIP_GRAPH_POSTPROCESSING:
|
||||
# !! hardcoded result filenames
|
||||
# filtered graph
|
||||
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
|
||||
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
|
||||
verify_path(loading_path)
|
||||
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
|
||||
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
|
||||
pre_step_skipped = False
|
||||
else:
|
||||
warnings.warn('No graph postprocessing action selected. Skipped.')
|
||||
# ** time analysis
|
||||
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
|
||||
# no check for fails, runs separately
|
||||
ret = run_time_analysis()
|
||||
elif not SKIP_TIME_ANALYSIS:
|
||||
...
|
||||
else:
|
||||
warnings.warn('No time analysis action selected. Skipped.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
BIN  scripts/inputs/without_nan.pkl  (new file)
Binary file not shown.
@@ -1,17 +1,21 @@
# lang_main: Config file

[paths]
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = false
token_analysis = true
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
scripts/lang_main_config.toml  (new file, 59 lines)
@@ -0,0 +1,59 @@
# lang_main: Config file

[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
# input_features = [
#     'VorgangsTypName',
#     'VorgangsArtText',
#     'VorgangsBeschreibung',
# ]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
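For reference, the constants module further down in this diff consumes these values through load_toml_config. A minimal, self-contained sketch of how such a file can be mapped onto typed constants, assuming only the standard library's tomllib (Python 3.11+) and a config file lying next to the script; the constant names mirror the diff, the loading code itself is illustrative and not the project's helper:

# Sketch only: stand-alone TOML loading, not the project's load_toml_config.
import tomllib
from pathlib import Path
from typing import Any, Final

CONFIG_FILE = Path('lang_main_config.toml')  # assumed location next to the script
with CONFIG_FILE.open('rb') as f:            # tomllib requires a binary file handle
    cfg: dict[str, Any] = tomllib.load(f)

SAVE_PATH_FOLDER: Final[Path] = Path(cfg['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(cfg['paths']['dataset'])
DO_TIME_ANALYSIS: Final[bool] = cfg['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = cfg['control']['time_analysis_skip']
THRESHOLD_SIMILARITY: Final[float] = cfg['preprocess']['threshold_similarity']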
scripts/test.py  (new file, 12 lines)
@@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER

print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789

Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))
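This script feeds a deliberately noisy German sample through clean_string_slim. Judging from the regex substitutions shown later in this diff (runs of [\t\n\r\f\v] collapsed to a space, repeated punctuation reduced to one character, then strip), a rough standalone approximation of that cleaning step, not the package's exact implementation, would be:

import re

def clean_string_slim_approx(string: str) -> str:
    # collapse tabs, newlines and other vertical whitespace into single spaces
    string = re.sub(r'[\t\n\r\f\v]+', ' ', string)
    # reduce runs of repeated punctuation (e.g. '::::', '!!!!', '+++') to one character;
    # this character class is a cleaned-up guess, the diff's own pattern differs slightly
    string = re.sub(r'([,;.:!?\-_+]){2,}', r'\1', string)
    return string.strip()

print(clean_string_slim_approx('tel:::: !!!!???? +++49 123 456 789'))
# -> 'tel: ? +49 123 456 789'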
@@ -1,18 +1,19 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.shared import (
    save_pickle,
    load_pickle,
    create_saving_folder,
    load_toml_config,
)
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
    create_saving_folder,
    load_pickle,
    load_toml_config,
    save_pickle,
)

__all__ = [
    'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
    datefmt=LOG_DATE_FMT,
)

USE_INTERNAL_CONFIG: Final[bool] = True
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False

pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    curr_file_dir = Path(inspect.getfile(inspect.currentframe()))  # type: ignore
    pkg_dir = curr_file_dir.parent
    config_path = Path(pkg_dir, 'config.toml')
    loaded_config = load_toml_config(path_to_toml=config_path)
    CONFIG: Final[dict[str, Any]] = loaded_config.copy()
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    raise NotImplementedError("External config data not implemented yet.")
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    cfg_path_external = caller_file.parent / CONFIG_FILENAME
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the program.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

# ** paths
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
    CONFIG['preprocess']['filename_cossim_filter_candidates']
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\
    CONFIG['preprocess']['threshold_amount_characters']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts']
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

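The reworked main() earlier in this diff pairs every DO_* flag with a SKIP_* counterpart (fed by the *_skip keys under [control] above): a step is either computed, restored from a previously pickled result, or skipped with a warning. A minimal sketch of that three-way pattern, with purely illustrative stand-in functions rather than the project's own:

import warnings

def run_step() -> str:              # stand-in for e.g. run_preprocessing()
    return 'fresh result'

def load_cached_result() -> str:    # stand-in for load_pickle() on a saved result file
    return 'cached result'

def resolve_step(do_step: bool, skip_step: bool) -> str | None:
    if do_step and not skip_step:
        return run_step()            # compute the step from scratch
    if not skip_step:
        return load_cached_result()  # reuse the result of an earlier run
    warnings.warn('Step skipped.')   # later steps must tolerate the missing result
    return None

print(resolve_step(do_step=True, skip_step=False))   # -> fresh result
print(resolve_step(do_step=False, skip_step=False))  # -> cached result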
@ -1,18 +1,18 @@
|
||||
import typing
|
||||
from typing import Any, Self, Literal, overload, Final
|
||||
import sys
|
||||
from collections.abc import Hashable
|
||||
from pathlib import Path
|
||||
import copy
|
||||
import sys
|
||||
import typing
|
||||
from collections.abc import Hashable, Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Final, Literal, Self, overload
|
||||
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from networkx import Graph, DiGraph
|
||||
import networkx as nx
|
||||
from networkx import DiGraph, Graph
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# TODO change logging behaviour, add logging to file
|
||||
LOGGING_DEFAULT: Final[bool] = False
|
||||
@ -31,8 +31,7 @@ def get_graph_metadata(
|
||||
min_edge_weight: int = 1_000_000
|
||||
max_edge_weight: int = 0
|
||||
for edge in graph.edges:
|
||||
weight = typing.cast(int,
|
||||
graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < min_edge_weight:
|
||||
min_edge_weight = weight
|
||||
if weight > max_edge_weight:
|
||||
@ -54,18 +53,20 @@ def get_graph_metadata(
|
||||
)
|
||||
|
||||
if logging:
|
||||
logger.info((f"Graph properties: {num_nodes} Nodes, "
|
||||
f"{num_edges} Edges"))
|
||||
logger.info(f"Node memory: {node_mem / 1024:.2f} KB")
|
||||
logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB")
|
||||
logger.info(f"Total memory: {total_mem / 1024:.2f} KB")
|
||||
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
|
||||
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
|
||||
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
|
||||
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
|
||||
|
||||
return graph_info
|
||||
|
||||
|
||||
def update_graph(
|
||||
graph: Graph | DiGraph,
|
||||
parent: Hashable,
|
||||
child: Hashable,
|
||||
*,
|
||||
batch: Iterable[tuple[Hashable, Hashable]] | None = None,
|
||||
parent: Hashable | None = None,
|
||||
child: Hashable | None = None,
|
||||
weight_connection: int = 1,
|
||||
) -> None:
|
||||
# !! not necessary to check for existence of nodes
|
||||
@ -78,7 +79,9 @@ def update_graph(
|
||||
graph.add_node(child)
|
||||
"""
|
||||
# check if edge not in Graph
|
||||
if not graph.has_edge(parent, child):
|
||||
if batch is not None:
|
||||
graph.add_edges_from(batch, weight=weight_connection)
|
||||
elif not graph.has_edge(parent, child):
|
||||
# create new edge, nodes will be created if not already present
|
||||
graph.add_edge(parent, child, weight=weight_connection)
|
||||
else:
|
||||
@ -87,16 +90,15 @@ def update_graph(
|
||||
weight += weight_connection
|
||||
graph[parent][child]['weight'] = weight
|
||||
|
||||
|
||||
# build undirected adjacency matrix
|
||||
def convert_graph_to_undirected(
|
||||
graph: DiGraph,
|
||||
logging: bool = LOGGING_DEFAULT,
|
||||
) -> Graph:
|
||||
# get adjacency matrix
|
||||
adj_mat = typing.cast(DataFrame,
|
||||
nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32],
|
||||
adj_mat.to_numpy())
|
||||
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
|
||||
# build undirected array: adding edges of lower triangular matrix to upper one
|
||||
arr_upper = np.triu(arr)
|
||||
arr_lower = np.tril(arr)
|
||||
@ -104,18 +106,17 @@ def convert_graph_to_undirected(
|
||||
arr_new = arr_upper + arr_lower
|
||||
# assign new data and create graph
|
||||
adj_mat.loc[:] = arr_new # type: ignore
|
||||
graph_undir = typing.cast(Graph,
|
||||
nx.from_pandas_adjacency(df=adj_mat))
|
||||
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
|
||||
|
||||
# info about graph
|
||||
if logging:
|
||||
logger.info("Successfully converted graph to one with undirected edges.")
|
||||
logger.info('Successfully converted graph to one with undirected edges.')
|
||||
_ = get_graph_metadata(graph=graph_undir, logging=logging)
|
||||
|
||||
return graph_undir
|
||||
|
||||
class TokenGraph(DiGraph):
|
||||
|
||||
class TokenGraph(DiGraph):
|
||||
def __init__(
|
||||
self,
|
||||
name: str = 'TokenGraph',
|
||||
@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
|
||||
return self.__str__()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return (f"TokenGraph(name: {self.name}, number of nodes: "
|
||||
f"{len(self.nodes)}, number of edges: "
|
||||
f"{len(self.edges)})")
|
||||
return (
|
||||
f'TokenGraph(name: {self.name}, number of nodes: '
|
||||
f'{len(self.nodes)}, number of edges: '
|
||||
f'{len(self.edges)})'
|
||||
)
|
||||
|
||||
# !! only used to verify that saving was done correctly
|
||||
"""
|
||||
@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
|
||||
self,
|
||||
inplace: Literal[True] = ...,
|
||||
logging: bool | None = ...,
|
||||
) -> None:
|
||||
...
|
||||
) -> None: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: Literal[False],
|
||||
logging: bool | None = ...,
|
||||
) -> Graph:
|
||||
...
|
||||
) -> Graph: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: bool = ...,
|
||||
logging: bool | None = ...
|
||||
) -> Graph | None:
|
||||
...
|
||||
self, inplace: bool = ..., logging: bool | None = ...
|
||||
) -> Graph | None: ...
|
||||
|
||||
def to_undirected(
|
||||
self,
|
||||
@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._undirected = convert_graph_to_undirected(graph=self,
|
||||
logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
if not inplace:
|
||||
return self._undirected
|
||||
|
||||
@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
|
||||
self._metadata_directed = get_graph_metadata(graph=self,
|
||||
logging=logging)
|
||||
self._metadata_directed = get_graph_metadata(graph=self, logging=logging)
|
||||
if self._undirected is not None:
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
|
||||
logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
)
|
||||
|
||||
def filter_by_edge_weight(
|
||||
self,
|
||||
@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
|
||||
filtered_graph = self.copy()
|
||||
|
||||
for edge in original_graph_edges:
|
||||
weight = typing.cast(int,
|
||||
filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
|
||||
if weight < threshold:
|
||||
filtered_graph.remove_edge(edge[0], edge[1])
|
||||
|
||||
@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
|
||||
filename: str | None = None,
|
||||
) -> Path:
|
||||
if filename is not None:
|
||||
saving_path = path.joinpath(f"{filename}")
|
||||
saving_path = path.joinpath(f'{filename}')
|
||||
else:
|
||||
saving_path = path.joinpath(f"{self.name}")
|
||||
saving_path = path.joinpath(f'{self.name}')
|
||||
|
||||
return saving_path
|
||||
|
||||
@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
|
||||
elif not directed and self._undirected is not None:
|
||||
target_graph = self._undirected
|
||||
else:
|
||||
raise ValueError("No undirected graph available.")
|
||||
raise ValueError('No undirected graph available.')
|
||||
|
||||
saving_path = saving_path.with_suffix('.graphml')
|
||||
nx.write_graphml(G=target_graph, path=saving_path)
|
||||
logger.info(("Successfully saved graph as GraphML file "
|
||||
f"under {saving_path}."))
|
||||
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
|
||||
|
||||
def to_pickle(
|
||||
self,
|
||||
@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
|
||||
match path.suffix:
|
||||
case '.graphml':
|
||||
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
|
||||
logger.info(f"Successfully loaded graph from GraphML file {path}.")
|
||||
logger.info(f'Successfully loaded graph from GraphML file {path}.')
|
||||
case '.pkl' | '.pickle':
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
logger.info(f"Successfully loaded graph from pickle file {path}.")
|
||||
logger.info(f'Successfully loaded graph from pickle file {path}.')
|
||||
case _:
|
||||
raise ValueError("File format not supported.")
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
return graph
|
||||
|
||||
@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
|
||||
path = Path(path)
|
||||
|
||||
if path.suffix not in ('.pkl', '.pickle'):
|
||||
raise ValueError("File format not supported.")
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
graph = typing.cast(Self, load_pickle(path))
|
||||
|
||||
|
||||
@ -1,29 +1,29 @@
|
||||
from typing import cast, Callable
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from itertools import combinations
|
||||
import re
|
||||
from math import factorial
|
||||
from pathlib import Path
|
||||
from typing import Callable, cast
|
||||
|
||||
import numpy as np
|
||||
from torch import Tensor
|
||||
from pandas import DataFrame, Series
|
||||
import pandas as pd
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from pandas import DataFrame, Series
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from torch import Tensor
|
||||
from tqdm import tqdm
|
||||
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.shared import (
|
||||
candidates_by_index,
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
|
||||
from lang_main.loggers import logger_preprocess as logger
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.types import Embedding, PandasIndex
|
||||
|
||||
|
||||
# ** (1) dataset preparation: loading and simple preprocessing
|
||||
@ -67,11 +67,16 @@ def load_raw_data(
|
||||
parse_dates=date_cols,
|
||||
dayfirst=True,
|
||||
)
|
||||
logger.info("Loaded dataset successfully.")
|
||||
logger.info((f"Dataset properties: number of entries: {len(data)}, "
|
||||
f"number of features {len(data.columns)}"))
|
||||
logger.info('Loaded dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'Dataset properties: number of entries: {len(data)}, '
|
||||
f'number of features {len(data.columns)}'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
def remove_duplicates(
|
||||
data: DataFrame,
|
||||
) -> tuple[DataFrame]:
|
||||
@ -89,7 +94,7 @@ def remove_duplicates(
|
||||
"""
|
||||
# obtain info about duplicates over all features
|
||||
duplicates_filt = data.duplicated()
|
||||
logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}")
|
||||
logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
|
||||
# drop duplicates
|
||||
wo_duplicates = data.drop_duplicates(ignore_index=True)
|
||||
duplicates_subset: list[str] = [
|
||||
@ -97,16 +102,26 @@ def remove_duplicates(
|
||||
'ObjektID',
|
||||
]
|
||||
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
|
||||
logger.info(("Number of duplicates over subset "
|
||||
f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}"))
|
||||
wo_duplicates =\
|
||||
wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy()
|
||||
logger.info("Removed all duplicates from dataset successfully.")
|
||||
logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, "
|
||||
f"number of features {len(wo_duplicates.columns)}"))
|
||||
logger.info(
|
||||
(
|
||||
'Number of duplicates over subset '
|
||||
f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
|
||||
)
|
||||
)
|
||||
wo_duplicates = wo_duplicates.drop_duplicates(
|
||||
subset=duplicates_subset, ignore_index=True
|
||||
).copy()
|
||||
logger.info('Removed all duplicates from dataset successfully.')
|
||||
logger.info(
|
||||
(
|
||||
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
|
||||
f'number of features {len(wo_duplicates.columns)}'
|
||||
)
|
||||
)
|
||||
|
||||
return (wo_duplicates,)
|
||||
|
||||
|
||||
def remove_NA(
|
||||
data: DataFrame,
|
||||
target_features: list[str] = [
|
||||
@ -128,15 +143,16 @@ def remove_NA(
|
||||
dataset with removed NA entries for given subset of features
|
||||
"""
|
||||
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
|
||||
logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.")
|
||||
logger.info(
|
||||
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
|
||||
)
|
||||
|
||||
return (wo_NA,)
|
||||
|
||||
|
||||
# ** (2) entry-based cleansing
|
||||
# following functions clean and prepare specific entries, not whole dataset
|
||||
def clean_string_slim(
|
||||
string: str
|
||||
) -> str:
|
||||
def clean_string_slim(string: str) -> str:
|
||||
"""mapping function to clean single string entries in a series (feature-wise)
|
||||
of the dataset, used to be applied element-wise for string features
|
||||
|
||||
@ -151,13 +167,16 @@ def clean_string_slim(
|
||||
cleaned entry
|
||||
"""
|
||||
# remove special chars
|
||||
pattern = r'[\t\n\r\f\v]'
|
||||
pattern = r'[\t\n\r\f\v]+'
|
||||
string = re.sub(pattern, ' ', string)
|
||||
pattern = r'([,;.:!?-_\+]){2,}'
|
||||
# remove whitespaces at the beginning and the end
|
||||
string = re.sub(pattern, r'\1', string)
|
||||
string = string.strip()
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def entry_wise_cleansing(
|
||||
data: DataFrame,
|
||||
target_feature: str,
|
||||
@ -165,10 +184,16 @@ def entry_wise_cleansing(
|
||||
) -> tuple[DataFrame]:
|
||||
# apply given cleansing function to target feature
|
||||
data[target_feature] = data[target_feature].map(cleansing_func)
|
||||
logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< "
|
||||
f"for feature >>{target_feature}<<"))
|
||||
logger.info(
|
||||
(
|
||||
f'Successfully applied entry-wise cleansing procedure '
|
||||
f'>>{cleansing_func.__name__}<< '
|
||||
f'for feature >>{target_feature}<<'
|
||||
)
|
||||
)
|
||||
return (data,)
|
||||
|
||||
|
||||
# ** in-depth analysis of one feature
|
||||
# following functions try to gain insights on a given feature of the IHM dataset such
|
||||
# as number of occurrences or associated Object IDs
|
||||
@ -178,7 +203,7 @@ def analyse_feature(
|
||||
) -> tuple[DataFrame]:
|
||||
# feature columns
|
||||
feature_entries = data[target_feature]
|
||||
logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}")
|
||||
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
|
||||
# obtain unique entries
|
||||
unique_feature_entries = feature_entries.unique()
|
||||
|
||||
@ -186,7 +211,7 @@ def analyse_feature(
|
||||
cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
|
||||
result_df = pd.DataFrame(columns=cols)
|
||||
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.):
|
||||
for entry in tqdm(unique_feature_entries, mininterval=1.0):
|
||||
len_entry = len(entry)
|
||||
filt = data[target_feature] == entry
|
||||
temp = data[filt]
|
||||
@ -195,13 +220,10 @@ def analyse_feature(
|
||||
num_assoc_obj_ids = len(assoc_obj_ids)
|
||||
num_dupl = filt.sum()
|
||||
|
||||
conc_df = pd.DataFrame(data=[[
|
||||
entry,
|
||||
len_entry,
|
||||
num_dupl,
|
||||
assoc_obj_ids,
|
||||
num_assoc_obj_ids
|
||||
]], columns=cols)
|
||||
conc_df = pd.DataFrame(
|
||||
data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
|
||||
columns=cols,
|
||||
)
|
||||
|
||||
result_df = pd.concat([result_df, conc_df], ignore_index=True)
|
||||
|
||||
@ -230,9 +252,9 @@ def build_embedding_map(
|
||||
is_STRF = True
|
||||
|
||||
if not any((is_spacy, is_STRF)):
|
||||
raise NotImplementedError("Model type unknown")
|
||||
raise NotImplementedError('Model type unknown')
|
||||
|
||||
for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.):
|
||||
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
|
||||
# verbose code: Pyright not inferring types correctly
|
||||
idx = cast(int, idx)
|
||||
text = cast(str, text)
|
||||
@ -246,12 +268,17 @@ def build_embedding_map(
|
||||
logger.debug(f'{embd.text=} has no vector')
|
||||
elif is_STRF:
|
||||
model = cast(SentenceTransformer, model)
|
||||
embd = cast(Tensor,
|
||||
model.encode(text, show_progress_bar=False))
|
||||
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
|
||||
embeddings[idx] = (embd, text)
|
||||
|
||||
return embeddings, (is_spacy, is_STRF)
|
||||
|
||||
|
||||
# adapt interface
|
||||
# use candidates by index function
|
||||
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
|
||||
|
||||
|
||||
# build similarity matrix out of embeddings
|
||||
def build_cosSim_matrix(
|
||||
data: Series,
|
||||
@ -259,10 +286,11 @@ def build_cosSim_matrix(
|
||||
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
|
||||
# build empty matrix
|
||||
df_index = data.index
|
||||
cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index,
|
||||
index=df_index, dtype=np.float32)
|
||||
cosineSim_idx_matrix = pd.DataFrame(
|
||||
data=0.0, columns=df_index, index=df_index, dtype=np.float32
|
||||
)
|
||||
|
||||
logger.info("Start building embedding map...")
|
||||
logger.info('Start building embedding map...')
|
||||
|
||||
# obtain embeddings based on used model
|
||||
embds, (is_spacy, is_STRF) = build_embedding_map(
|
||||
@ -270,15 +298,15 @@ def build_cosSim_matrix(
|
||||
model=model,
|
||||
)
|
||||
|
||||
logger.info("Embedding map built successfully.")
|
||||
logger.info('Embedding map built successfully.')
|
||||
|
||||
# apply index based mapping for efficient handling of large texts
|
||||
combs = combinations(df_index, 2)
|
||||
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
|
||||
|
||||
logger.info("Start calculation of similarity scores...")
|
||||
logger.info('Start calculation of similarity scores...')
|
||||
|
||||
for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.):
|
||||
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
|
||||
# print(f"{idx1=}, {idx2=}")
|
||||
embd1 = embds[idx1][0]
|
||||
embd2 = embds[idx2][0]
|
||||
@ -296,10 +324,11 @@ def build_cosSim_matrix(
|
||||
|
||||
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
|
||||
|
||||
logger.info("Similarity scores calculated successfully.")
|
||||
logger.info('Similarity scores calculated successfully.')
|
||||
|
||||
return cosineSim_idx_matrix, embds
|
||||
|
||||
|
||||
# obtain index pairs with cosine similarity
|
||||
# greater than or equal to given threshold value
|
||||
def filt_thresh_cosSim_matrix(
|
||||
@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
|
||||
Series
|
||||
series with multi index (index pairs) and corresponding similarity score
|
||||
"""
|
||||
cosineSim_filt = cast(Series,
|
||||
cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack())
|
||||
cosineSim_filt = cast(
|
||||
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
|
||||
)
|
||||
|
||||
return cosineSim_filt, embds
|
||||
|
||||
|
||||
def list_cosSim_dupl_candidates(
|
||||
cosineSim_filt: Series,
|
||||
embds: dict[int, tuple[Embedding, str]],
|
||||
@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
|
||||
list containing relevant index pairs for entries with similarity score greater than
|
||||
given threshold
|
||||
"""
|
||||
logger.info("Start gathering of similarity candidates...")
|
||||
logger.info('Start gathering of similarity candidates...')
|
||||
# compare found duplicates
|
||||
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
embds[idx1][1],
|
||||
idx2,
|
||||
embds[idx2][1],
|
||||
score,
|
||||
]]
|
||||
]
|
||||
]
|
||||
# add candidates to collection DataFrame
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
if df_candidates.empty:
|
||||
@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
|
||||
# save index pairs
|
||||
index_pairs.append((idx1, idx2))
|
||||
|
||||
logger.info("Similarity candidates gathered successfully.")
|
||||
logger.info('Similarity candidates gathered successfully.')
|
||||
|
||||
if save_candidates:
|
||||
if saving_path is None:
|
||||
raise ValueError(("Saving path must be provided if duplicate "
|
||||
"candidates should be saved."))
|
||||
raise ValueError(
|
||||
('Saving path must be provided if duplicate ' 'candidates should be saved.')
|
||||
)
|
||||
elif pipeline is not None:
|
||||
target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_'
|
||||
+ filename + '.xlsx')
|
||||
target_filename = (
|
||||
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
|
||||
)
|
||||
elif pipeline is None:
|
||||
target_filename = f'{filename}.xlsx'
|
||||
logger.info("Saving similarity candidates...")
|
||||
logger.info('Saving similarity candidates...')
|
||||
target_path = saving_path.joinpath(target_filename)
|
||||
df_candidates.to_excel(target_path)
|
||||
logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.")
|
||||
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
|
||||
|
||||
return index_pairs, embds
|
||||
|
||||
|
||||
# TODO: change implementation fully to SentenceTransformer
|
||||
# usage of batch processing for embeddings, use candidate idx function
|
||||
# from time analysis --> moved to ``helpers.py``
|
||||
@ -419,16 +455,24 @@ def similar_ids_groups(
|
||||
yield list(id_group)
|
||||
"""
|
||||
|
||||
|
||||
def merge_similarity_dupl(
|
||||
data: DataFrame,
|
||||
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info("Start merging of similarity candidates...")
|
||||
logger.info('Start merging of similarity candidates...')
|
||||
|
||||
# data
|
||||
merged_data = data.copy()
|
||||
model_input = merged_data['entry']
|
||||
candidates_idx = candidates_by_index(
|
||||
data_model_input=model_input,
|
||||
model=model,
|
||||
cos_sim_threshold=cos_sim_threshold,
|
||||
)
|
||||
# graph of similar ids
|
||||
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
|
||||
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
|
||||
|
||||
for similar_id_group in similar_index_groups(similar_id_graph):
|
||||
similar_id_group = list(similar_id_group)
|
||||
@ -454,10 +498,11 @@ def merge_similarity_dupl(
|
||||
merged_data.update(merged_similar_data)
|
||||
merged_data = merged_data.drop(index=similar_id_group)
|
||||
|
||||
logger.info("Similarity candidates merged successfully.")
|
||||
logger.info('Similarity candidates merged successfully.')
|
||||
|
||||
return (merged_data.copy(),)
|
||||
|
||||
|
||||
# merge duplicates
|
||||
def merge_similarity_dupl_old(
|
||||
data: DataFrame,
|
||||
@ -469,8 +514,7 @@ def merge_similarity_dupl_old(
|
||||
# logger.info("Start merging of similarity candidates...")
|
||||
|
||||
# iterate over index pairs
|
||||
for (i1, i2) in tqdm(dupl_idx_pairs):
|
||||
|
||||
for i1, i2 in tqdm(dupl_idx_pairs):
|
||||
# if an entry does not exist any more, skip this pair
|
||||
if i1 not in index or i2 not in index:
|
||||
continue
|
||||
@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
|
||||
given threshold
|
||||
"""
|
||||
|
||||
|
||||
# compare found duplicates
|
||||
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
|
||||
df_candidates = pd.DataFrame(columns=columns)
|
||||
|
||||
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
|
||||
|
||||
for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore
|
||||
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
|
||||
# get texts for comparison
|
||||
text1 = embds[idx1][1]
|
||||
text2 = embds[idx2][1]
|
||||
@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
|
||||
continue
|
||||
|
||||
# get text content from embedding as second tuple entry
|
||||
content = [[
|
||||
content = [
|
||||
[
|
||||
idx1,
|
||||
text1,
|
||||
idx2,
|
||||
text2,
|
||||
score,
|
||||
]]
|
||||
]
|
||||
]
|
||||
df_conc = pd.DataFrame(columns=columns, data=content)
|
||||
|
||||
df_candidates = pd.concat([df_candidates, df_conc])
|
||||
|
||||
@@ -1,11 +1,71 @@
from typing import cast
from collections.abc import Iterable, Iterator
from typing import cast

import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex
from lang_main.analysis.graphs import update_graph, get_graph_metadata


def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    obj_id : ObjectID
        _description_
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
        ObjectID and tuple of index pairs which meet the cosine
        similarity threshold
    """
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair


def similar_index_connection_graph(
@@ -15,21 +75,21 @@ def similar_index_connection_graph(
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    for (idx1, idx2) in similar_idx_pairs:
        # inplace operation, parent/child do not really exist in undirected graph
        update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


# TODO check returning tuple

def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]],
                      nx.connected_components(G=similar_id_graph))
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
|
||||
for id_group in ids_groups:
|
||||
yield tuple(id_group)
|
||||
@ -1,21 +1,17 @@
|
||||
from typing import cast
|
||||
from collections.abc import Iterable, Iterator
|
||||
from typing import cast
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from pandas import DataFrame, Series
|
||||
from torch import Tensor
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import sentence_transformers
|
||||
import sentence_transformers.util
|
||||
from tqdm.auto import tqdm # TODO: check deletion
|
||||
|
||||
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.analysis.shared import (
|
||||
candidates_by_index,
|
||||
similar_index_connection_graph,
|
||||
similar_index_groups,
|
||||
)
|
||||
from lang_main.loggers import logger_timeline as logger
|
||||
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
|
||||
|
||||
|
||||
def non_relevant_obj_ids(
|
||||
@ -25,16 +21,16 @@ def non_relevant_obj_ids(
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> tuple[ObjectID, ...]:
|
||||
|
||||
data = data.copy()
|
||||
ids_to_ignore: set[ObjectID] = set()
|
||||
obj_ids = cast(Iterable[ObjectID], # actually NumPy array
|
||||
data[feature_obj_id].unique())
|
||||
obj_ids = cast(
|
||||
Iterable[ObjectID], # actually NumPy array
|
||||
data[feature_obj_id].unique(),
|
||||
)
|
||||
|
||||
for obj_id in obj_ids:
|
||||
feats_per_obj_id = cast(
|
||||
Series,
|
||||
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
|
||||
Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
|
||||
)
|
||||
# check for uniqueness of given feature for current ObjectID
|
||||
# ignore NaN values
|
||||
@ -46,14 +42,15 @@ def non_relevant_obj_ids(
|
||||
|
||||
return tuple(ids_to_ignore)
|
||||
|
||||
|
||||
def remove_non_relevant_obj_ids(
|
||||
data: DataFrame,
|
||||
thresh_unique_feat_per_id: int,
|
||||
*,
|
||||
feature_uniqueness: str = 'HObjektText',
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
) -> DataFrame:
|
||||
logger.info("Removing non-relevant ObjectIDs from dataset")
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info('Removing non-relevant ObjectIDs from dataset')
|
||||
data = data.copy()
|
||||
ids_to_ignore = non_relevant_obj_ids(
|
||||
data=data,
|
||||
@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
|
||||
)
|
||||
# only retain entries with ObjectIDs not in IDs to ignore
|
||||
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
|
||||
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
|
||||
logger.info("Non-relevant ObjectIDs removed successfully")
|
||||
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
|
||||
logger.info('Non-relevant ObjectIDs removed successfully')
|
||||
|
||||
return data
|
||||
return (data,)
|
||||
|
||||
def filter_activities_per_obj_id(
|
||||
data: DataFrame,
|
||||
activity_feature: str = 'VorgangsTypName',
|
||||
relevant_activity_types: Iterable[str] = (
|
||||
'Reparaturauftrag (Portal)',
|
||||
),
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
threshold_num_activities: int = 1,
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities count occurrences for each ObjectID
|
||||
logger.info("Filtering activities per ObjectID")
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series,
|
||||
data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
|
||||
# index of series contains ObjectIDs
|
||||
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
|
||||
filt_entries_below_thresh = (data_filter_activities[feature_obj_id]
|
||||
.isin(obj_ids_below_thresh))
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info("Activities per ObjectID filtered successfully")
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
def generate_model_input(
|
||||
data: DataFrame,
|
||||
@ -107,8 +74,8 @@ def generate_model_input(
|
||||
'VorgangsArtText',
|
||||
'VorgangsBeschreibung',
|
||||
),
|
||||
) -> DataFrame:
|
||||
logger.info("Generating concatenation of model input features")
|
||||
) -> tuple[DataFrame]:
|
||||
logger.info('Generating concatenation of model input features')
|
||||
data = data.copy()
|
||||
model_input_features = list(model_input_features)
|
||||
input_features = data[model_input_features].fillna('').astype(str)
|
||||
@ -116,9 +83,40 @@ def generate_model_input(
|
||||
lambda x: ' - '.join(x),
|
||||
axis=1,
|
||||
)
|
||||
logger.info("Model input generated successfully")
|
||||
logger.info('Model input generated successfully')
|
||||
|
||||
return (data,)
|
||||
|
||||
|
||||
def filter_activities_per_obj_id(
|
||||
data: DataFrame,
|
||||
activity_feature: str = 'VorgangsTypName',
|
||||
relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
threshold_num_activities: int = 1,
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities count occurrences for each ObjectID
|
||||
logger.info('Filtering activities per ObjectID')
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
num_activities_per_obj_id = cast(
|
||||
Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
|
||||
)
|
||||
# filter for ObjectIDs with more than given number of activities
|
||||
filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
|
||||
# index of series contains ObjectIDs
|
||||
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
|
||||
filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
|
||||
obj_ids_below_thresh
|
||||
)
|
||||
|
||||
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
|
||||
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
|
||||
logger.info('Activities per ObjectID filtered successfully')
|
||||
|
||||
return data_filter_activities, num_activities_per_obj_id
|
||||
|
||||
return data
|
||||
|
||||
# for each obj_id in relevant_obj_ids
|
||||
## filter data for obj_id
|
||||
@ -130,6 +128,7 @@ def generate_model_input(
|
||||
## obtain idx pairs, yield
|
||||
## use idx pairs to get idx values of series
|
||||
|
||||
|
||||
def get_timeline_candidates_index(
|
||||
data: DataFrame,
|
||||
num_activities_per_obj_id: Series,
|
||||
@ -140,14 +139,10 @@ def get_timeline_candidates_index(
|
||||
model_input_feature: str = 'nlp_model_input',
|
||||
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
|
||||
# already sorted ObjIDs (descending regarding number of activities)
|
||||
obj_ids = cast(Iterable[ObjectID],
|
||||
num_activities_per_obj_id.index)
|
||||
obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)
|
||||
|
||||
for obj_id in tqdm(obj_ids):
|
||||
data_per_obj_id = cast(
|
||||
DataFrame,
|
||||
data.loc[data[feature_obj_id]==obj_id]
|
||||
)
|
||||
data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
|
||||
data_model_input = data_per_obj_id[model_input_feature]
|
||||
|
||||
candidates_idx = candidates_by_index(
|
||||
@ -156,7 +151,7 @@ def get_timeline_candidates_index(
|
||||
cos_sim_threshold=cos_sim_threshold,
|
||||
)
|
||||
# directly process candidates
|
||||
candidates_idx = tuple(candidates_idx)
|
||||
# candidates_idx = tuple(candidates_idx)
|
||||
similar_id_graph, _ = similar_index_connection_graph(
|
||||
similar_idx_pairs=candidates_idx,
|
||||
)
|
||||
@ -164,63 +159,8 @@ def get_timeline_candidates_index(
|
||||
for index_group in similar_index_groups(similar_id_graph):
|
||||
yield obj_id, index_group
|
||||
|
||||
|
||||
# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""Filter candidate index pairs based on cosine similarity,
using a SentenceTransformer model in batch mode;
the data is fed as a Series to retain the indices of the entries so they
can be accessed later in the original dataset

Parameters
----------
data_model_input : Series
Series containing indices and text entries to process
model : SentenceTransformer
SentenceTransformer model used to encode the text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5

Yields
------
Iterator[tuple[PandasIndex, PandasIndex]]
index pairs whose entries meet the cosine similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair

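For reference, a minimal self-contained sketch of the same cosine-similarity pairing on a toy pandas Series; the toy texts, the model checkpoint and the 0.8 threshold are illustrative assumptions, not values fixed by this commit.

import numpy as np
from pandas import Series
from sentence_transformers import SentenceTransformer, util

# toy input: the Series index plays the role of the original dataset index
texts = Series(
    ['Pumpe defekt', 'Pumpe ist defekt', 'Wartung durchgeführt'],
    index=[10, 42, 97],
)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embds = model.encode(texts.to_list(), convert_to_tensor=True, show_progress_bar=False)

# pairwise cosine similarity; zero the diagonal and keep only the upper
# triangle so every unordered pair is considered exactly once
cos_sim = util.cos_sim(embds, embds).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)

# map positional hits back to the original pandas index labels
for i, j in np.argwhere(cos_sim >= 0.8):
    print((texts.index[i], texts.index[j]))
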
def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(

return candidates_by_obj_id

def map_obj_texts(

def map_obj_id_to_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())

obj_id_to_text: dict[ObjectID, str] = {}

for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
for obj_id in tqdm(obj_ids):
data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text

return obj_id_to_text

def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')

return tl_candidates, map_obj_text

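The tuple returned here is what the Dash demo further down consumes. A rough sketch of how the pickled result could be inspected, assuming TimelineCandidates behaves like a mapping from ObjectID to groups of similar row indices (the file path is illustrative):

from lang_main import load_pickle

# the pipeline step writes a pickled (candidates, texts) tuple; path is an assumption
ret = load_pickle('./results/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
tl_candidates, obj_texts = ret[0], ret[1]

for obj_id, groups in tl_candidates.items():
    print(obj_id, obj_texts[obj_id])
    for group in groups:
        # each group is a collection of row indices with similar model input texts
        print('  candidate timeline over rows:', list(group))
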
@ -1,21 +1,20 @@
|
||||
from typing import cast
|
||||
import re
|
||||
from itertools import combinations
|
||||
from collections.abc import Iterator
|
||||
from itertools import combinations
|
||||
from typing import cast
|
||||
|
||||
from dateutil.parser import parse
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from pandas import DataFrame
|
||||
from spacy.lang.de import German as GermanSpacyModel
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
from spacy.tokens.token import Token as SpacyToken
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
from lang_main.analysis.graphs import (
|
||||
update_graph,
|
||||
TokenGraph,
|
||||
update_graph,
|
||||
)
|
||||
|
||||
from lang_main.loggers import logger_token_analysis as logger
|
||||
|
||||
# ** Logging
|
||||
# LOGGING_LEVEL = 'INFO'
|
||||
@ -38,13 +37,14 @@ TAG_OF_INTEREST: frozenset[str] = frozenset()
|
||||
|
||||
# ** obtaining connection in texts
|
||||
|
||||
def pre_clean_word(string: str) -> str:
|
||||
|
||||
def pre_clean_word(string: str) -> str:
|
||||
pattern = r'[^A-Za-zäöüÄÖÜ]+'
|
||||
string = re.sub(pattern, '', string)
|
||||
|
||||
return string
|
||||
|
||||
|
||||
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
|
||||
def is_str_date(
|
||||
string: str,
|
||||
@ -67,10 +67,10 @@ def is_str_date(
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def obtain_relevant_descendants(
|
||||
token: SpacyToken,
|
||||
) -> Iterator[SpacyToken]:
|
||||
|
||||
for descendant in token.subtree:
|
||||
# subtrees contain the token itself
|
||||
# if current element is token skip this element
|
||||
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
|
||||
if is_str_date(string=descendant.text):
|
||||
continue
|
||||
|
||||
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
|
||||
f">>{descendant}<<, POS >>{descendant.pos_}<<"))
|
||||
logger.debug(
|
||||
(
|
||||
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
|
||||
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
|
||||
)
|
||||
)
|
||||
|
||||
# eliminate cases of cross-references with verbs
|
||||
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and
|
||||
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
|
||||
if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
|
||||
descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
|
||||
):
|
||||
continue
|
||||
# skip cases in which descendant is indirect POS with others than verbs
|
||||
elif descendant.pos_ in POS_INDIRECT:
|
||||
@ -99,6 +104,7 @@ def obtain_relevant_descendants(
|
||||
|
||||
# TODO look at results and fine-tune function accordingly
|
||||
|
||||
|
||||
def add_doc_info_to_graph(
|
||||
graph: TokenGraph,
|
||||
doc: SpacyDoc,
|
||||
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
|
||||
graph=graph,
|
||||
parent=token.lemma_,
|
||||
child=descendant.lemma_,
|
||||
weight_connection=weight
|
||||
weight_connection=weight,
|
||||
)
|
||||
else:
|
||||
# if indirect POS, make connection between all associated words
|
||||
@ -139,6 +145,7 @@ def add_doc_info_to_graph(
|
||||
weight_connection=weight,
|
||||
)
|
||||
|
||||
|
||||
def build_token_graph(
|
||||
data: DataFrame,
|
||||
model: GermanSpacyModel,
|
||||
|
||||
55  src/lang_main/constants.py  Normal file
@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final

from lang_main import CONFIG

# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export

# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis

# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_acitivities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]
56  src/lang_main/lang_main_config.toml  Normal file
@ -0,0 +1,56 @@
# lang_main: Config file

[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
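As a quick illustration of how such a config file feeds the constants above, a minimal sketch using the standard-library tomllib; the file path used here is an assumption for the example, not part of the commit.

import tomllib
from pathlib import Path

path_to_toml = Path('./src/lang_main/lang_main_config.toml')  # illustrative path
with open(path_to_toml, 'rb') as f:
    config = tomllib.load(f)

# same access pattern as in constants.py
threshold_similarity = config['preprocess']['threshold_similarity']  # 0.8
date_cols = config['preprocess']['date_cols']                        # list of date columns
run_preprocessing = config['control']['preprocessing']               # True
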
@ -1,5 +1,5 @@
|
||||
from typing import Final
|
||||
import logging
|
||||
from typing import Final
|
||||
|
||||
from lang_main.types import LoggingLevels
|
||||
|
||||
|
||||
@ -1,20 +1,18 @@
|
||||
from typing import Any
|
||||
#from types import FunctionType
|
||||
import sys
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_pipelines as logger
|
||||
from lang_main.shared import save_pickle, load_pickle
|
||||
from lang_main.shared import load_pickle, save_pickle
|
||||
|
||||
# ** pipelines to perform given actions on dataset in a customisable manner
|
||||
|
||||
|
||||
class NoPerformableActionError(Exception):
|
||||
"""Error describing that no action is available in the current pipeline"""
|
||||
|
||||
class BasePipeline():
|
||||
|
||||
class BasePipeline:
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
@ -27,6 +25,8 @@ class BasePipeline():
|
||||
self.name = name
|
||||
# working directory for pipeline == output path
|
||||
self.working_dir = working_dir
|
||||
# if not self.working_dir.exists():
|
||||
# self.working_dir.mkdir(parents=True)
|
||||
|
||||
# container for actions to perform during pass
|
||||
self.actions: list[Callable] = []
|
||||
@ -39,8 +39,10 @@ class BasePipeline():
|
||||
self._intermediate_result: Any | None = None
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return (f"{self.__class__.__name__}(name: {self.name}, "
|
||||
f"working dir: {self.working_dir}, contents: {self.action_names})")
|
||||
return (
|
||||
f'{self.__class__.__name__}(name: {self.name}, '
|
||||
f'working dir: {self.working_dir}, contents: {self.action_names})'
|
||||
)
|
||||
|
||||
@property
|
||||
def intermediate_result(self) -> Any:
|
||||
@ -60,8 +62,9 @@ class BasePipeline():
|
||||
self.actions_kwargs.append(action_kwargs.copy())
|
||||
self.is_save_result.append(save_result)
|
||||
else:
|
||||
raise TypeError(("Action must be custom function, "
|
||||
f"but is of type >>{type(action)}<<."))
|
||||
raise TypeError(
|
||||
f'Action must be custom function, but is of type >>{type(action)}<<.'
|
||||
)
|
||||
|
||||
# TODO: add multiple entries by utilising simple add method
|
||||
"""
|
||||
@ -107,13 +110,14 @@ class BasePipeline():
|
||||
return data
|
||||
|
||||
def prep_run(self) -> None:
|
||||
logger.info(f"Starting processing pipeline >>{self.name}<<...")
|
||||
logger.info(f'Starting processing pipeline >>{self.name}<<...')
|
||||
# progress tracking
|
||||
self.curr_proc_idx = 1
|
||||
# check if performable actions available
|
||||
if len(self.actions) == 0:
|
||||
raise NoPerformableActionError(("The pipeline does not contain any "
|
||||
"performable actions."))
|
||||
raise NoPerformableActionError(
|
||||
('The pipeline does not contain any ' 'performable actions.')
|
||||
)
|
||||
|
||||
def run(
|
||||
self,
|
||||
@ -135,6 +139,6 @@ class BasePipeline():
|
||||
# processing tracking
|
||||
self.curr_proc_idx += 1
|
||||
|
||||
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.")
|
||||
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
|
||||
|
||||
return ret
|
||||
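To make the intended use of BasePipeline concrete, a rough usage sketch modelled on the predefined pipelines later in this commit; the toy actions and the assumption that each action receives the previous step's output as positional arguments and returns a tuple are illustrative only.

from pathlib import Path

from lang_main.pipelines.base import BasePipeline

# two toy actions (assumed calling convention: previous output + registered kwargs in, tuple out)
def square(x: int) -> tuple[int]:
    return (x * x,)

def shift(x: int, offset: int) -> tuple[int]:
    return (x + offset,)

pipe = BasePipeline(name='Demo', working_dir=Path('./results/demo'))
pipe.add(square)
pipe.add(shift, {'offset': 1}, save_result=True)

ret = pipe.run(starting_values=(3,))  # expected to yield (10,) under the assumption above
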
@ -1,57 +1,144 @@
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import spacy
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from lang_main import (
|
||||
SAVE_PATH_FOLDER,
|
||||
DATE_COLS,
|
||||
FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
from lang_main.analysis.preprocessing import (
|
||||
load_raw_data,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
analyse_feature,
|
||||
clean_string_slim,
|
||||
entry_wise_cleansing,
|
||||
analyse_feature,
|
||||
build_cosSim_matrix,
|
||||
filt_thresh_cosSim_matrix,
|
||||
list_cosSim_dupl_candidates,
|
||||
load_raw_data,
|
||||
merge_similarity_dupl,
|
||||
remove_duplicates,
|
||||
remove_NA,
|
||||
)
|
||||
from lang_main.analysis.timeline import (
|
||||
filter_activities_per_obj_id,
|
||||
generate_model_input,
|
||||
get_timeline_candidates,
|
||||
remove_non_relevant_obj_ids,
|
||||
)
|
||||
from lang_main.analysis.tokens import build_token_graph
|
||||
from lang_main.constants import (
|
||||
ACTIVITY_FEATURE,
|
||||
ACTIVITY_TYPES,
|
||||
DATE_COLS,
|
||||
FEATURE_NAME_OBJ_ID,
|
||||
MODEL_INPUT_FEATURES,
|
||||
SAVE_PATH_FOLDER,
|
||||
THRESHOLD_NUM_ACTIVITIES,
|
||||
THRESHOLD_SIMILARITY,
|
||||
THRESHOLD_TIMELINE_SIMILARITY,
|
||||
THRESHOLD_UNIQUE_TEXTS,
|
||||
UNIQUE_CRITERION_FEATURE,
|
||||
)
|
||||
from lang_main.pipelines.base import BasePipeline
|
||||
|
||||
# ** pipeline configuration
|
||||
# ** target feature preparation
|
||||
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
|
||||
pipe_target_feat.add(
|
||||
load_raw_data,
|
||||
{
|
||||
'date_cols': DATE_COLS,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(remove_duplicates)
|
||||
pipe_target_feat.add(remove_NA, save_result=True)
|
||||
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})
|
||||
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)
|
||||
pipe_target_feat.add(
|
||||
entry_wise_cleansing,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
'cleansing_func': clean_string_slim,
|
||||
},
|
||||
)
|
||||
pipe_target_feat.add(
|
||||
analyse_feature,
|
||||
{
|
||||
'target_feature': 'VorgangsBeschreibung',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
# output: DataFrame containing target feature with
|
||||
# number of occurrences and associated ObjectIDs
|
||||
|
||||
# ** embedding pipe
|
||||
# ?? still needed?
|
||||
# using similarity between entries to catch duplicates with typo or similar content
|
||||
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
|
||||
model_spacy = spacy.load('de_dep_news_trf')
|
||||
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
|
||||
|
||||
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True)
|
||||
pipe_embds.add(
|
||||
list_cosSim_dupl_candidates,
|
||||
{'save_candidates': True,
|
||||
'saving_path': SAVE_PATH_FOLDER,
|
||||
'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
'pipeline': pipe_embds}, save_result=True)
|
||||
# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
|
||||
# pipe_embds.add(
|
||||
# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
|
||||
# )
|
||||
# pipe_embds.add(
|
||||
# list_cosSim_dupl_candidates,
|
||||
# {
|
||||
# 'save_candidates': True,
|
||||
# 'saving_path': SAVE_PATH_FOLDER,
|
||||
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
|
||||
# 'pipeline': pipe_embds,
|
||||
# },
|
||||
# save_result=True,
|
||||
# )
|
||||
|
||||
# ** Merge duplicates
|
||||
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
# pipe_merge.add(merge_similarity_dupl, save_result=True)
|
||||
pipe_merge.add(
|
||||
merge_similarity_dupl,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_SIMILARITY,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
# ** token analysis
|
||||
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)
|
||||
pipe_token_analysis.add(
|
||||
build_token_graph,
|
||||
{
|
||||
'model': model_spacy,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
|
||||
# ** timeline analysis
|
||||
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
|
||||
pipe_timeline.add(
|
||||
remove_non_relevant_obj_ids,
|
||||
{
|
||||
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
|
||||
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
pipe_timeline.add(
|
||||
generate_model_input,
|
||||
{
|
||||
'target_feature_name': 'nlp_model_input',
|
||||
'model_input_features': MODEL_INPUT_FEATURES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
filter_activities_per_obj_id,
|
||||
{
|
||||
'activity_feature': ACTIVITY_FEATURE,
|
||||
'relevant_activity_types': ACTIVITY_TYPES,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
|
||||
},
|
||||
)
|
||||
pipe_timeline.add(
|
||||
get_timeline_candidates,
|
||||
{
|
||||
'model': model_stfr,
|
||||
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'model_input_feature': 'nlp_model_input',
|
||||
},
|
||||
save_result=True,
|
||||
)
|
||||
|
||||
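For orientation, a sketch of how these predefined pipelines could be chained by a driver script, following the run_preprocessing pattern shown earlier in this commit; handing the preprocessed DataFrame straight to pipe_timeline is an assumption about the intended wiring, not something this diff fixes.

import typing

from pandas import DataFrame

from lang_main.constants import PATH_TO_DATASET
from lang_main.pipelines.predefined import pipe_target_feat, pipe_timeline

# preprocessing: raw CSV -> cleaned DataFrame (pattern from run_preprocessing)
ret = typing.cast(tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
data = ret[0]

# timeline analysis on the preprocessed data (assumed hand-over between the two pipes)
tl_candidates, obj_texts = pipe_timeline.run(starting_values=(data,))
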
@ -1,38 +1,47 @@
|
||||
from typing import Any
|
||||
import os
|
||||
import shutil
|
||||
import pickle
|
||||
import shutil
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from lang_main.loggers import logger_shared_helpers as logger
|
||||
|
||||
|
||||
# ** Lib
|
||||
def create_saving_folder(
|
||||
saving_path_folder: str | Path,
|
||||
overwrite_existing: bool = False,
|
||||
) -> None:
|
||||
# check for existence of given path
|
||||
if not os.path.exists(saving_path_folder):
|
||||
os.makedirs(saving_path_folder)
|
||||
if isinstance(saving_path_folder, str):
|
||||
saving_path_folder = Path(saving_path_folder)
|
||||
if not saving_path_folder.exists():
|
||||
saving_path_folder.mkdir(parents=True)
|
||||
else:
|
||||
if overwrite_existing:
|
||||
# overwrite if desired (deletes whole path and re-creates it)
|
||||
shutil.rmtree(saving_path_folder)
|
||||
os.makedirs(saving_path_folder)
|
||||
else:
|
||||
logger.info((f"Path >>{saving_path_folder}<< already exists and remained "
|
||||
"unchanged. If you want to overwrite this path, use parameter "
|
||||
">>overwrite_existing<<."))
|
||||
logger.info(
|
||||
(
|
||||
f'Path >>{saving_path_folder}<< already exists and remained '
|
||||
f'unchanged. If you want to overwrite this path, use parameter '
|
||||
f'>>overwrite_existing<<.'
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def load_toml_config(
|
||||
path_to_toml: str | Path,
|
||||
) -> dict[str, Any]:
|
||||
with open(path_to_toml, "rb") as f:
|
||||
with open(path_to_toml, 'rb') as f:
|
||||
data = tomllib.load(f)
|
||||
logger.info("Loaded TOML config file successfully.")
|
||||
logger.info('Loaded TOML config file successfully.')
|
||||
return data
|
||||
|
||||
|
||||
# saving and loading using pickle
|
||||
# careful: pickling from unknown sources can be dangerous
|
||||
def save_pickle(
|
||||
@ -41,16 +50,18 @@ def save_pickle(
|
||||
) -> None:
|
||||
with open(path, 'wb') as file:
|
||||
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
logger.info(f"Saved file successfully under {path}")
|
||||
logger.info(f'Saved file successfully under {path}')
|
||||
|
||||
|
||||
def load_pickle(
|
||||
path: str | Path,
|
||||
) -> Any:
|
||||
with open(path, 'rb') as file:
|
||||
obj = pickle.load(file)
|
||||
logger.info("Loaded file successfully.")
|
||||
logger.info('Loaded file successfully.')
|
||||
return obj
|
||||
|
||||
|
||||
# TODO: remove, too specialised for common application
|
||||
"""
|
||||
def filter_candidates_idx(
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import TypeAlias, Literal
|
||||
from typing import Literal, TypeAlias
|
||||
|
||||
import numpy as np
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
|
||||
@ -13,29 +13,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'ihm_analyse'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
|
||||
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from ihm_analyse import CONFIG\n",
|
||||
"from ihm_analyse.lib.preprocess import (\n",
|
||||
"from lang_main import CONFIG\n",
|
||||
"from lang_main.lib.preprocess import (\n",
|
||||
" load_raw_data,\n",
|
||||
" remove_duplicates,\n",
|
||||
" remove_NA,\n",
|
||||
@ -47,8 +43,8 @@
|
||||
" list_cosSim_dupl_candidates,\n",
|
||||
" merge_similarity_dupl,\n",
|
||||
")\n",
|
||||
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from ihm_analyse.lib.helpers import (\n",
|
||||
"from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
|
||||
"from lang_main.lib.helpers import (\n",
|
||||
" save_pickle, \n",
|
||||
" load_pickle, \n",
|
||||
" create_saving_folder,\n",
|
||||
|
||||
BIN  test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl  Normal file
Binary file not shown.
@ -1,28 +1,42 @@
|
||||
from typing import cast
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from dash import (
|
||||
Dash,
|
||||
html,
|
||||
dcc,
|
||||
callback,
|
||||
Output,
|
||||
Input,
|
||||
Output,
|
||||
State,
|
||||
callback,
|
||||
dash_table,
|
||||
dcc,
|
||||
html,
|
||||
)
|
||||
import plotly.express as px
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main import load_pickle
|
||||
from lang_main.types import TimelineCandidates, ObjectID
|
||||
from lang_main.types import ObjectID, TimelineCandidates
|
||||
from pandas import DataFrame
|
||||
|
||||
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
|
||||
|
||||
# ** data
|
||||
data = cast(DataFrame, load_pickle('./data.pkl'))
|
||||
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
|
||||
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
|
||||
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
|
||||
p_tl = Path(
|
||||
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
|
||||
)
|
||||
ret = cast(tuple[DataFrame], load_pickle(p_df))
|
||||
data = ret[0]
|
||||
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
|
||||
cands = ret[0]
|
||||
texts = ret[1]
|
||||
|
||||
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
|
||||
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
|
||||
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
|
||||
# data = cast(DataFrame, load_pickle(p_df))
|
||||
# cands = cast(TimelineCandidates, load_pickle(p_cands))
|
||||
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
|
||||
|
||||
table_feats = [
|
||||
'ErstellungsDatum',
|
||||
'ErledigungsDatum',
|
||||
@ -53,23 +67,26 @@ app = Dash(prevent_initial_callbacks=True)
|
||||
|
||||
app.layout = [
|
||||
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
|
||||
html.Div(children=[
|
||||
html.Div(
|
||||
children=[
|
||||
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
|
||||
dcc.Dropdown(
|
||||
list(cands.keys()),
|
||||
id='dropdown-selection',
|
||||
placeholder="ObjektID auswählen...",
|
||||
)
|
||||
]),
|
||||
html.Div(children=[
|
||||
placeholder='ObjektID auswählen...',
|
||||
),
|
||||
]
|
||||
),
|
||||
html.Div(
|
||||
children=[
|
||||
html.H3(id='object_text'),
|
||||
dcc.Dropdown(id='choice-candidates'),
|
||||
dcc.Graph(id='graph-output'),
|
||||
]),
|
||||
html.Div(children=[
|
||||
dash_table.DataTable(id='table-candidates')
|
||||
]),
|
||||
]
|
||||
),
|
||||
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
|
||||
]
|
||||
|
||||
|
||||
@callback(
|
||||
Output('object_text', 'children'),
|
||||
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
|
||||
headline = f'HObjektText: {obj_text}'
|
||||
return headline
|
||||
|
||||
|
||||
@callback(
|
||||
Output('choice-candidates', 'options'),
|
||||
Input('dropdown-selection', 'value'),
|
||||
@ -93,6 +111,7 @@ def update_choice_candidates(obj_id):
|
||||
choices = list(range(1, len(cands_obj_id) + 1))
|
||||
return choices
|
||||
|
||||
|
||||
@callback(
|
||||
Output('graph-output', 'figure'),
|
||||
Input('choice-candidates', 'value'),
|
||||
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
|
||||
title=title,
|
||||
hover_data=hover_data,
|
||||
)
|
||||
fig.update_traces(
|
||||
mode='markers+lines',
|
||||
marker=markers,
|
||||
marker_symbol='diamond'
|
||||
)
|
||||
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
|
||||
fig.update_xaxes(
|
||||
tickformat="%B\n%Y",
|
||||
tickformat='%B\n%Y',
|
||||
rangeslider_visible=True,
|
||||
)
|
||||
fig.update_yaxes(type='category')
|
||||
fig.update_layout(hovermode="x unified")
|
||||
fig.update_layout(hovermode='x unified')
|
||||
return fig
|
||||
|
||||
|
||||
@callback(
|
||||
[Output('table-candidates', 'data'),
|
||||
Output('table-candidates', 'columns')],
|
||||
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
|
||||
Input('choice-candidates', 'value'),
|
||||
State('dropdown-selection', 'value'),
|
||||
prevent_initial_call=True,
|
||||
@ -144,10 +159,10 @@ def update_table_candidates(index, obj_id):
|
||||
cands_choice = cands_obj_id[int(index) - 1]
|
||||
# data
|
||||
df = data.loc[list(cands_choice)].sort_index()
|
||||
df = (df
|
||||
.filter(items=table_feats, axis=1)
|
||||
.sort_values(by='ErstellungsDatum', ascending=True))
|
||||
cols = [{"name": i, "id": i} for i in df.columns]
|
||||
df = df.filter(items=table_feats, axis=1).sort_values(
|
||||
by='ErstellungsDatum', ascending=True
|
||||
)
|
||||
cols = [{'name': i, 'id': i} for i in df.columns]
|
||||
# convert dates to strings
|
||||
for col in table_feats_dates:
|
||||
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
|
||||
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
|
||||
table_data = df.to_dict('records')
|
||||
return table_data, cols
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True)
|
||||
56  test-notebooks/dashboard/lang_main_config.toml  Normal file
@ -0,0 +1,56 @@
|
||||
# lang_main: Config file
|
||||
|
||||
[paths]
|
||||
inputs = './inputs/'
|
||||
results = './results/test_new2/'
|
||||
dataset = './01_2_Rohdaten_neu/Export4.csv'
|
||||
#results = './results/Export7/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
|
||||
#results = './results/Export7_trunc/'
|
||||
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
|
||||
|
||||
[control]
|
||||
preprocessing = true
|
||||
preprocessing_skip = false
|
||||
token_analysis = false
|
||||
token_analysis_skip = false
|
||||
graph_postprocessing = false
|
||||
graph_postprocessing_skip = false
|
||||
time_analysis = false
|
||||
time_analysis_skip = false
|
||||
|
||||
#[export_filenames]
|
||||
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
|
||||
[preprocess]
|
||||
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
|
||||
date_cols = [
|
||||
"VorgangsDatum",
|
||||
"ErledigungsDatum",
|
||||
"Arbeitsbeginn",
|
||||
"ErstellungsDatum",
|
||||
]
|
||||
threshold_amount_characters = 5
|
||||
threshold_similarity = 0.8
|
||||
|
||||
[graph_postprocessing]
|
||||
threshold_edge_weight = 150
|
||||
|
||||
[time_analysis.uniqueness]
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
|
||||
[time_analysis.model_input]
|
||||
input_features = [
|
||||
'VorgangsTypName',
|
||||
'VorgangsArtText',
|
||||
'VorgangsBeschreibung',
|
||||
]
|
||||
activity_feature = 'VorgangsTypName'
|
||||
activity_types = [
|
||||
'Reparaturauftrag (Portal)',
|
||||
'Störungsmeldung',
|
||||
]
|
||||
threshold_num_acitivities = 1
|
||||
threshold_similarity = 0.8
|
||||
Binary file not shown.
663  test-notebooks/display_results.ipynb  Normal file
@ -0,0 +1,663 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from lang_main import load_pickle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_files(path: str) -> tuple[Path, ...]:\n",
|
||||
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
" assert p.exists(), \"path does not exist\"\n",
|
||||
" return tuple(p.glob(r'*'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
|
||||
]
|
||||
},
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file = files[-1]\n",
|
||||
"ret = load_pickle(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 89,
|
||||
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>entry</th>\n",
|
||||
" <th>len</th>\n",
|
||||
" <th>num_occur</th>\n",
|
||||
" <th>assoc_obj_ids</th>\n",
|
||||
" <th>num_assoc_obj_ids</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>162</th>\n",
|
||||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>92592</td>\n",
|
||||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||||
" <td>206</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||||
" <td>39</td>\n",
|
||||
" <td>3108</td>\n",
|
||||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>131</th>\n",
|
||||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||||
" <td>37</td>\n",
|
||||
" <td>1619</td>\n",
|
||||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>160</th>\n",
|
||||
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>1265</td>\n",
|
||||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>140</th>\n",
|
||||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||||
" <td>44</td>\n",
|
||||
" <td>687</td>\n",
|
||||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||||
" <td>166</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2559</th>\n",
|
||||
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
|
||||
" <td>46</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[211]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2558</th>\n",
|
||||
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
|
||||
" <td>30</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[93]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2557</th>\n",
|
||||
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1707]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2556</th>\n",
|
||||
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
|
||||
" <td>173</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6782</th>\n",
|
||||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||||
" <td>106</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>[306, 326]</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>4545 rows × 5 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" entry ... num_assoc_obj_ids\n",
|
||||
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
|
||||
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
|
||||
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
|
||||
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
|
||||
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
|
||||
"... ... ... ...\n",
|
||||
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
|
||||
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
|
||||
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
|
||||
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
|
||||
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
|
||||
"\n",
|
||||
"[4545 rows x 5 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 89,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ret[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
|
||||
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
|
||||
]
|
||||
},
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"file = files[-1]\n",
|
||||
"ret = load_pickle(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>index</th>\n",
|
||||
" <th>entry</th>\n",
|
||||
" <th>len</th>\n",
|
||||
" <th>num_occur</th>\n",
|
||||
" <th>assoc_obj_ids</th>\n",
|
||||
" <th>num_assoc_obj_ids</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>162</td>\n",
|
||||
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>92592</td>\n",
|
||||
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
|
||||
" <td>206</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>33</td>\n",
|
||||
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
|
||||
" <td>39</td>\n",
|
||||
" <td>3108</td>\n",
|
||||
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
|
||||
" <td>74</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>131</td>\n",
|
||||
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
|
||||
" <td>37</td>\n",
|
||||
" <td>1619</td>\n",
|
||||
" <td>[0, 970, 2134, 2137]</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>160</td>\n",
|
||||
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>1265</td>\n",
|
||||
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
|
||||
" <td>11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>140</td>\n",
|
||||
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
|
||||
" <td>44</td>\n",
|
||||
" <td>687</td>\n",
|
||||
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
|
||||
" <td>166</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6756</th>\n",
|
||||
" <td>2559</td>\n",
|
||||
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
|
||||
" <td>46</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[211]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6757</th>\n",
|
||||
" <td>2558</td>\n",
|
||||
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
|
||||
" <td>30</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[93]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6758</th>\n",
|
||||
" <td>2557</td>\n",
|
||||
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1707]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6759</th>\n",
|
||||
" <td>2556</td>\n",
|
||||
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
|
||||
" <td>173</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>[1]</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6760</th>\n",
|
||||
" <td>6782</td>\n",
|
||||
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
|
||||
" <td>106</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>[306, 326]</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>4545 rows × 6 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" index ... num_assoc_obj_ids\n",
|
||||
"0 162 ... 206\n",
|
||||
"1 33 ... 74\n",
|
||||
"2 131 ... 4\n",
|
||||
"3 160 ... 11\n",
|
||||
"4 140 ... 166\n",
|
||||
"... ... ... ...\n",
|
||||
"6756 2559 ... 1\n",
|
||||
"6757 2558 ... 1\n",
|
||||
"6758 2557 ... 1\n",
|
||||
"6759 2556 ... 1\n",
|
||||
"6760 6782 ... 2\n",
|
||||
"\n",
|
||||
"[4545 rows x 6 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 71,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ret[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tqdm.auto import tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 84,
|
||||
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tup = tuple(i for i in range(100000000))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/100000000 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"num = 0\n",
|
||||
"for i in tqdm(tup):\n",
|
||||
" num += i"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"4999999950000000"
|
||||
]
|
||||
},
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"num"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "36366147-3632-4518-936e-878563305e49",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"arr = np.random.rand(1000,1000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"idx = np.argwhere(arr >= 0.97)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"30220"
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(idx)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch = idx[:200]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%timeit\n",
|
||||
"tuple(map(tuple, batch))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%timeit\n",
|
||||
"tuple(tuple(x) for x in batch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||