Use SentenceTransformer (STRF) for similarity duplicates, add time analysis pipeline, enhance config

Florian Förster 2024-05-29 16:34:31 +02:00
parent 5d2c97165a
commit bb987e2108
30 changed files with 1875 additions and 693 deletions

View File

@ -34,3 +34,15 @@ trials = [
"plotly>=5.22.0",
"dash>=2.17.0",
]
[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"
[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = ["E", "F", "I"]

View File

@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast
from pandas import DataFrame, Series
from ihm_analyse import (
SAVE_PATH_FOLDER,
PATH_TO_DATASET,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
DO_PREPROCESSING,
DO_TOKEN_ANALYSIS,
DO_GRAPH_POSTPROCESSING,
from lang_main import (
TokenGraph,
create_saving_folder,
load_pickle,
Embedding,
Index,
TokenGraph,
)
from ihm_analyse.predefined_pipes import (
pipe_target_feat,
pipe_embds,
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
DO_TIME_ANALYSIS,
DO_TOKEN_ANALYSIS,
INPUT_PATH_FOLDER,
PATH_TO_DATASET,
SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING,
SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS,
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
pipe_timeline,
pipe_token_analysis,
)
"""
# ** config parameters
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters']
"""
from lang_main.types import (
ObjectID,
TimelineCandidates,
)
from pandas import DataFrame, Series
# ** processing pipeline
def run_preprocessing() -> DataFrame:
@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
overwrite_existing=True,
)
# run pipelines
ret = typing.cast(tuple[DataFrame],
pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)))
ret = typing.cast(
tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
)
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series,
(target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
dupl_idx_pairs, embds = typing.cast(
tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]],
pipe_embds.run(starting_values=(subset_data,))
)
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
# dupl_idx_pairs, embds = typing.cast(
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
# pipe_embds.run(starting_values=(subset_data,)),
# )
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame],
pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)))
subset_data = target_feat_data.loc[data_filter].copy()
ret = typing.cast(
tuple[DataFrame],
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
pipe_merge.run(starting_values=(subset_data,)),
)
preprocessed_data = ret[0]
return preprocessed_data
def run_token_analysis(
preprocessed_data: DataFrame,
) -> TokenGraph:
# build token graph
(tk_graph,) = typing.cast(tuple[TokenGraph],
pipe_token_analysis.run(starting_values=(preprocessed_data,)))
(tk_graph,) = typing.cast(
tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
)
tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
tk_graph.to_pickle(SAVE_PATH_FOLDER,
filename=f'{pipe_token_analysis.name}-TokenGraph')
tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
return tk_graph
def run_graph_postprocessing(
tk_graph: TokenGraph,
) -> TokenGraph:
# filter graph by edge weight and remove single nodes (no connection)
tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
tk_graph_filtered.save_graph(SAVE_PATH_FOLDER,
filename='TokenGraph-filtered',
directed=False)
tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER,
filename=f'{pipe_token_analysis.name}-TokenGraph-filtered')
tk_graph_filtered.save_graph(
SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
)
tk_graph_filtered.to_pickle(
SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
)
return tk_graph_filtered
if __name__ == '__main__':
def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
filename = 'without_nan'
loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
ret = load_pickle(loading_path)
preprocessed_data = ret[0]
ret = cast(
tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)),
)
return ret
def verify_path(
loading_path: Path,
) -> None:
if not loading_path.exists():
raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
def main() -> None:
pre_step_skipped: bool = False
# ** preprocess
if DO_PREPROCESSING:
if DO_PREPROCESSING and not SKIP_PREPROCESSING:
preprocessed_data = run_preprocessing()
else:
elif not SKIP_PREPROCESSING:
# !! hardcoded result filenames
target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
ret = typing.cast(tuple[DataFrame],
load_pickle(target_filepath))
loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
verify_path(loading_path)
ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
preprocessed_data = ret[0]
# ** token analysis
if DO_TOKEN_ANALYSIS:
preprocessed_data_trunc = typing.cast(DataFrame,
preprocessed_data[['entry', 'num_occur']].copy()) # type: ignore
tk_graph = run_token_analysis(preprocessed_data_trunc)
else:
pre_step_skipped = True
warnings.warn('No preprocessing action selected. Skipped.')
# sys.exit(0)
# ** token analysis
if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
if pre_step_skipped:
raise RuntimeError(
'Preprocessing step skipped. Token analysis cannot be performed.'
)
preprocessed_data_trunc = typing.cast(
DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
) # type: ignore
tk_graph = run_token_analysis(preprocessed_data_trunc)
elif not SKIP_TOKEN_ANALYSIS:
# !! hardcoded result filenames
# whole graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
#tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph = TokenGraph.from_pickle(loading_path)
# ** graph postprocessing
if DO_GRAPH_POSTPROCESSING:
tk_graph_filtered = run_graph_postprocessing(tk_graph)
pre_step_skipped = False
else:
pre_step_skipped = True
warnings.warn('No token analysis action selected. Skipped.')
# ** graph postprocessing
if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
if pre_step_skipped:
raise RuntimeError(
(
'Preprocessing or token analysis step skipped. '
'Graph postprocessing cannot be performed.'
)
)
tk_graph_filtered = run_graph_postprocessing(tk_graph)
elif not SKIP_GRAPH_POSTPROCESSING:
# !! hardcoded result filenames
# filtered graph
filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle')
#tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
verify_path(loading_path)
# tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
tk_graph_filtered = TokenGraph.from_pickle(loading_path)
pre_step_skipped = False
else:
warnings.warn('No graph postprocessing action selected. Skipped.')
# ** time analysis
if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
# no dependency check on previous stages; time analysis runs on its own inputs
ret = run_time_analysis()
elif not SKIP_TIME_ANALYSIS:
...
else:
warnings.warn('No time analysis action selected. Skipped.')
if __name__ == '__main__':
main()
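Each stage in main() is gated twice: a DO_* flag decides whether to compute the stage fresh, and a SKIP_* flag decides whether to bypass it entirely (otherwise cached results are loaded). A minimal sketch of that gating for a single generic stage, with made-up helper names (compute, load_cached):

import warnings

def run_stage(do_flag: bool, skip_flag: bool, compute, load_cached):
    # mirrors the DO_*/SKIP_* pattern above: fresh run, cached reload, or skip
    if do_flag and not skip_flag:
        return compute()
    elif not skip_flag:
        return load_cached()
    warnings.warn('Stage skipped.')
    return None  # downstream stages must treat a skipped stage as missing input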

Binary file not shown.

View File

@ -1,17 +1,21 @@
# lang_main: Config file
[paths]
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = false
token_analysis = true
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

View File

@ -0,0 +1,59 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
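lang_main reads this file via load_toml_config; with target-version py311 the standard-library tomllib is sufficient, so load_toml_config is presumably a thin wrapper like the sketch below. The path is a placeholder; the nested tables map directly onto dictionary lookups:

import tomllib
from pathlib import Path

cfg_path = Path('lang_main_config.toml')   # placeholder; lang_main resolves the real location
with cfg_path.open('rb') as file:          # tomllib requires a binary file handle
    config = tomllib.load(file)

do_time_analysis = config['control']['time_analysis']
sim_threshold = config['time_analysis']['model_input']['threshold_similarity']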

scripts/test.py (new file, +12 lines)
View File

@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))

View File

@ -1,18 +1,19 @@
from typing import Final, Any
import inspect
import sys
import logging
from time import gmtime
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
from lang_main.shared import (
save_pickle,
load_pickle,
create_saving_folder,
load_toml_config,
)
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
create_saving_folder,
load_pickle,
load_toml_config,
save_pickle,
)
__all__ = [
'save_pickle',
@ -32,37 +33,30 @@ logging.basicConfig(
datefmt=LOG_DATE_FMT,
)
USE_INTERNAL_CONFIG: Final[bool] = True
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
# load config data: internal/external
if USE_INTERNAL_CONFIG:
curr_file_dir = Path(inspect.getfile(inspect.currentframe())) # type: ignore
pkg_dir = curr_file_dir.parent
config_path = Path(pkg_dir, 'config.toml')
loaded_config = load_toml_config(path_to_toml=config_path)
CONFIG: Final[dict[str, Any]] = loaded_config.copy()
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
raise NotImplementedError("External config data not implemented yet.")
caller_file = Path(inspect.stack()[-1].filename)
if not caller_file.exists():
raise FileNotFoundError('Caller file could not be correctly retrieved.')
cfg_path_external = caller_file.parent / CONFIG_FILENAME
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the program.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
# ** paths
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
# ** export
# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\
CONFIG['preprocess']['filename_cossim_filter_candidates']
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\
CONFIG['preprocess']['threshold_amount_characters']
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis
# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts']
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
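Note that inspect.stack()[-1] resolves the outermost frame, i.e. the script that started the interpreter; under a REPL or some test runners that frame has no real file (e.g. '<stdin>'), which is why the existence check above is needed before deriving the external config path. A small standalone sketch of the same lookup:

import inspect
from pathlib import Path

def entry_script_dir() -> Path:
    # directory of the outermost caller, mirroring the external-config lookup above
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():  # e.g. '<stdin>' in an interactive session
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    return caller_file.parent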

View File

@ -1,18 +1,18 @@
import typing
from typing import Any, Self, Literal, overload, Final
import sys
from collections.abc import Hashable
from pathlib import Path
import copy
import sys
import typing
from collections.abc import Hashable, Iterable
from pathlib import Path
from typing import Any, Final, Literal, Self, overload
import networkx as nx
import numpy as np
import numpy.typing as npt
from networkx import Graph, DiGraph
import networkx as nx
from networkx import DiGraph, Graph
from pandas import DataFrame
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import save_pickle, load_pickle
from lang_main.shared import load_pickle, save_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@ -31,8 +31,7 @@ def get_graph_metadata(
min_edge_weight: int = 1_000_000
max_edge_weight: int = 0
for edge in graph.edges:
weight = typing.cast(int,
graph[edge[0]][edge[1]]['weight'])
weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
if weight < min_edge_weight:
min_edge_weight = weight
if weight > max_edge_weight:
@ -54,18 +53,20 @@ def get_graph_metadata(
)
if logging:
logger.info((f"Graph properties: {num_nodes} Nodes, "
f"{num_edges} Edges"))
logger.info(f"Node memory: {node_mem / 1024:.2f} KB")
logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB")
logger.info(f"Total memory: {total_mem / 1024:.2f} KB")
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
return graph_info
def update_graph(
graph: Graph | DiGraph,
parent: Hashable,
child: Hashable,
*,
batch: Iterable[tuple[Hashable, Hashable]] | None = None,
parent: Hashable | None = None,
child: Hashable | None = None,
weight_connection: int = 1,
) -> None:
# !! not necessary to check for existence of nodes
@ -78,7 +79,9 @@ def update_graph(
graph.add_node(child)
"""
# check if edge not in Graph
if not graph.has_edge(parent, child):
if batch is not None:
graph.add_edges_from(batch, weight=weight_connection)
elif not graph.has_edge(parent, child):
# create new edge, nodes will be created if not already present
graph.add_edge(parent, child, weight=weight_connection)
else:
@ -87,40 +90,38 @@ def update_graph(
weight += weight_connection
graph[parent][child]['weight'] = weight
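update_graph now has two modes: single parent/child edges whose weight accumulates across repeated calls, and a bulk insert via Graph.add_edges_from. Note that the batch path assigns weight_connection to every edge it touches instead of incrementing existing weights, which suits the similarity-index graphs it is used for. A short usage sketch (assuming the import path stays lang_main.analysis.graphs):

import networkx as nx
from lang_main.analysis.graphs import update_graph

g = nx.Graph()
update_graph(graph=g, parent='pumpe', child='defekt')   # creates edge with weight 1
update_graph(graph=g, parent='pumpe', child='defekt')   # same edge, weight accumulates to 2
update_graph(graph=g, batch=[(1, 2), (2, 3)])           # bulk mode, weight 1 each
print(g['pumpe']['defekt']['weight'], g.number_of_edges())  # 2 3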
# build undirected adjacency matrix
def convert_graph_to_undirected(
graph: DiGraph,
logging: bool = LOGGING_DEFAULT,
) -> Graph:
# get adjacency matrix
adj_mat = typing.cast(DataFrame,
nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
arr = typing.cast(npt.NDArray[np.uint32],
adj_mat.to_numpy())
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
# build undirected array: adding edges of lower triangular matrix to upper one
arr_upper = np.triu(arr)
arr_lower = np.tril(arr)
arr_lower = np.rot90(np.fliplr(arr_lower))
arr_new = arr_upper + arr_lower
# assign new data and create graph
adj_mat.loc[:] = arr_new # type: ignore
graph_undir = typing.cast(Graph,
nx.from_pandas_adjacency(df=adj_mat))
adj_mat.loc[:] = arr_new # type: ignore
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
# info about graph
if logging:
logger.info("Successfully converted graph to one with undirected edges.")
logger.info('Successfully converted graph to one with undirected edges.')
_ = get_graph_metadata(graph=graph_undir, logging=logging)
return graph_undir
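The conversion folds the directed adjacency matrix: the lower triangle is mirrored onto the upper one, so the weights of A→B and B→A are summed into a single undirected edge. A tiny NumPy illustration of the same folding step:

import numpy as np

arr = np.array([[0, 2, 0],
                [5, 0, 1],
                [0, 0, 0]], dtype=np.uint32)   # directed edge weights
arr_upper = np.triu(arr)
arr_lower = np.rot90(np.fliplr(np.tril(arr)))  # equals the transpose of the lower triangle
folded = arr_upper + arr_lower
print(folded[0, 1])  # 7: the A->B weight 2 and the B->A weight 5 merge into one undirected edge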
class TokenGraph(DiGraph):
class TokenGraph(DiGraph):
def __init__(
self,
name: str = 'TokenGraph',
enable_logging: bool = True,
incoming_graph_data: Any| None = None,
incoming_graph_data: Any | None = None,
**attr,
) -> None:
super().__init__(incoming_graph_data, **attr)
@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
return self.__str__()
def __str__(self) -> str:
return (f"TokenGraph(name: {self.name}, number of nodes: "
f"{len(self.nodes)}, number of edges: "
f"{len(self.edges)})")
return (
f'TokenGraph(name: {self.name}, number of nodes: '
f'{len(self.nodes)}, number of edges: '
f'{len(self.edges)})'
)
# !! only used to verify that saving was done correctly
"""
@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
self,
inplace: Literal[True] = ...,
logging: bool | None = ...,
) -> None:
...
) -> None: ...
@overload
def to_undirected(
self,
inplace: Literal[False],
logging: bool | None = ...,
) -> Graph:
...
) -> Graph: ...
@overload
def to_undirected(
self,
inplace: bool = ...,
logging: bool | None = ...
) -> Graph | None:
...
self, inplace: bool = ..., logging: bool | None = ...
) -> Graph | None: ...
def to_undirected(
self,
@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
if logging is None:
logging = self.logging
self._undirected = convert_graph_to_undirected(graph=self,
logging=logging)
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
logging=logging)
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
self._metadata_undirected = get_graph_metadata(
graph=self._undirected, logging=logging
)
if not inplace:
return self._undirected
@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
if logging is None:
logging = self.logging
self._metadata_directed = get_graph_metadata(graph=self,
logging=logging)
self._metadata_directed = get_graph_metadata(graph=self, logging=logging)
if self._undirected is not None:
self._metadata_undirected = get_graph_metadata(graph=self._undirected,
logging=logging)
self._metadata_undirected = get_graph_metadata(
graph=self._undirected, logging=logging
)
def filter_by_edge_weight(
self,
@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
filtered_graph = self.copy()
for edge in original_graph_edges:
weight = typing.cast(int,
filtered_graph[edge[0]][edge[1]]['weight'])
weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
if weight < threshold:
filtered_graph.remove_edge(edge[0], edge[1])
@ -287,7 +284,7 @@ class TokenGraph(DiGraph):
filtered_graph = self.copy()
for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore
degree = filtered_graph.degree[node] # type: ignore
if degree < threshold:
filtered_graph.remove_node(node)
@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
filename: str | None = None,
) -> Path:
if filename is not None:
saving_path = path.joinpath(f"{filename}")
saving_path = path.joinpath(f'{filename}')
else:
saving_path = path.joinpath(f"{self.name}")
saving_path = path.joinpath(f'{self.name}')
return saving_path
@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
elif not directed and self._undirected is not None:
target_graph = self._undirected
else:
raise ValueError("No undirected graph available.")
raise ValueError('No undirected graph available.')
saving_path = saving_path.with_suffix('.graphml')
nx.write_graphml(G=target_graph, path=saving_path)
logger.info(("Successfully saved graph as GraphML file "
f"under {saving_path}."))
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
def to_pickle(
self,
@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
match path.suffix:
case '.graphml':
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
logger.info(f"Successfully loaded graph from GraphML file {path}.")
logger.info(f'Successfully loaded graph from GraphML file {path}.')
case '.pkl' | '.pickle':
graph = typing.cast(Self, load_pickle(path))
logger.info(f"Successfully loaded graph from pickle file {path}.")
logger.info(f'Successfully loaded graph from pickle file {path}.')
case _:
raise ValueError("File format not supported.")
raise ValueError('File format not supported.')
return graph
@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
path = Path(path)
if path.suffix not in ('.pkl', '.pickle'):
raise ValueError("File format not supported.")
raise ValueError('File format not supported.')
graph = typing.cast(Self, load_pickle(path))

View File

@ -1,29 +1,29 @@
from typing import cast, Callable
import re
from collections.abc import Iterable
from itertools import combinations
import re
from math import factorial
from pathlib import Path
from typing import Callable, cast
import numpy as np
from torch import Tensor
from pandas import DataFrame, Series
import pandas as pd
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from sentence_transformers import SentenceTransformer
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm
from lang_main.types import Embedding, PandasIndex
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.shared import (
candidates_by_index,
similar_index_connection_graph,
similar_index_groups,
)
#from lang_main.analysis.graphs import update_graph, get_graph_metadata
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex
# ** (1) dataset preparation: loading and simple preprocessing
@ -67,11 +67,16 @@ def load_raw_data(
parse_dates=date_cols,
dayfirst=True,
)
logger.info("Loaded dataset successfully.")
logger.info((f"Dataset properties: number of entries: {len(data)}, "
f"number of features {len(data.columns)}"))
logger.info('Loaded dataset successfully.')
logger.info(
(
f'Dataset properties: number of entries: {len(data)}, '
f'number of features {len(data.columns)}'
)
)
return (data,)
def remove_duplicates(
data: DataFrame,
) -> tuple[DataFrame]:
@ -89,7 +94,7 @@ def remove_duplicates(
"""
# obtain info about duplicates over all features
duplicates_filt = data.duplicated()
logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}")
logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
# drop duplicates
wo_duplicates = data.drop_duplicates(ignore_index=True)
duplicates_subset: list[str] = [
@ -97,16 +102,26 @@ def remove_duplicates(
'ObjektID',
]
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
logger.info(("Number of duplicates over subset "
f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}"))
wo_duplicates =\
wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy()
logger.info("Removed all duplicates from dataset successfully.")
logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, "
f"number of features {len(wo_duplicates.columns)}"))
logger.info(
(
'Number of duplicates over subset '
f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
)
)
wo_duplicates = wo_duplicates.drop_duplicates(
subset=duplicates_subset, ignore_index=True
).copy()
logger.info('Removed all duplicates from dataset successfully.')
logger.info(
(
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
f'number of features {len(wo_duplicates.columns)}'
)
)
return (wo_duplicates,)
def remove_NA(
data: DataFrame,
target_features: list[str] = [
@ -127,16 +142,17 @@ def remove_NA(
DataFrame
dataset with removed NA entries for given subset of features
"""
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.")
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info(
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
)
return (wo_NA,)
# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(
string: str
) -> str:
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
@ -151,13 +167,16 @@ def clean_string_slim(
cleaned entry
"""
# remove special chars
pattern = r'[\t\n\r\f\v]'
pattern = r'[\t\n\r\f\v]+'
string = re.sub(pattern, ' ', string)
pattern = r'([,;.:!?\-_\+]){2,}'  # '-' escaped: literal hyphen, not a character range
# remove whitespaces at the beginning and the end
string = re.sub(pattern, r'\1', string)
string = string.strip()
return string
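The tightened patterns first collapse runs of whitespace control characters into single spaces, then shrink repeated punctuation to a single character, and finally strip surrounding whitespace. An illustrative call on a made-up string:

from lang_main.analysis.preprocessing import clean_string_slim

raw = 'Pumpe defekt!!!\n\tBitte prüfen....  '
print(clean_string_slim(raw))  # -> 'Pumpe defekt! Bitte prüfen.'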
def entry_wise_cleansing(
data: DataFrame,
target_feature: str,
@ -165,10 +184,16 @@ def entry_wise_cleansing(
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< "
f"for feature >>{target_feature}<<"))
logger.info(
(
f'Successfully applied entry-wise cleansing procedure '
f'>>{cleansing_func.__name__}<< '
f'for feature >>{target_feature}<<'
)
)
return (data,)
# ** in-depth analysis of one feature
# following functions try to gain insights on a given feature of the IHM dataset such
# as number of occurrences or associated Object IDs
@ -178,7 +203,7 @@ def analyse_feature(
) -> tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}")
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
# obtain unique entries
unique_feature_entries = feature_entries.unique()
@ -186,7 +211,7 @@ def analyse_feature(
cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
result_df = pd.DataFrame(columns=cols)
for entry in tqdm(unique_feature_entries, mininterval=1.):
for entry in tqdm(unique_feature_entries, mininterval=1.0):
len_entry = len(entry)
filt = data[target_feature] == entry
temp = data[filt]
@ -195,13 +220,10 @@ def analyse_feature(
num_assoc_obj_ids = len(assoc_obj_ids)
num_dupl = filt.sum()
conc_df = pd.DataFrame(data=[[
entry,
len_entry,
num_dupl,
assoc_obj_ids,
num_assoc_obj_ids
]], columns=cols)
conc_df = pd.DataFrame(
data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
columns=cols,
)
result_df = pd.concat([result_df, conc_df], ignore_index=True)
@ -230,9 +252,9 @@ def build_embedding_map(
is_STRF = True
if not any((is_spacy, is_STRF)):
raise NotImplementedError("Model type unknown")
raise NotImplementedError('Model type unknown')
for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.):
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
# verbose code: Pyright not inferring types correctly
idx = cast(int, idx)
text = cast(str, text)
@ -246,12 +268,17 @@ def build_embedding_map(
logger.debug(f'{embd.text=} has no vector')
elif is_STRF:
model = cast(SentenceTransformer, model)
embd = cast(Tensor,
model.encode(text, show_progress_bar=False))
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
embeddings[idx] = (embd, text)
return embeddings, (is_spacy, is_STRF)
# adapt interface
# use candidates by index function
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
# build similarity matrix out of embeddings
def build_cosSim_matrix(
data: Series,
@ -259,10 +286,11 @@ def build_cosSim_matrix(
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
# build empty matrix
df_index = data.index
cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index,
index=df_index, dtype=np.float32)
cosineSim_idx_matrix = pd.DataFrame(
data=0.0, columns=df_index, index=df_index, dtype=np.float32
)
logger.info("Start building embedding map...")
logger.info('Start building embedding map...')
# obtain embeddings based on used model
embds, (is_spacy, is_STRF) = build_embedding_map(
@ -270,16 +298,16 @@ def build_cosSim_matrix(
model=model,
)
logger.info("Embedding map built successfully.")
logger.info('Embedding map built successfully.')
# apply index based mapping for efficient handling of large texts
combs = combinations(df_index, 2)
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index)-2)
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
logger.info("Start calculation of similarity scores...")
logger.info('Start calculation of similarity scores...')
for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.):
#print(f"{idx1=}, {idx2=}")
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
# print(f"{idx1=}, {idx2=}")
embd1 = embds[idx1][0]
embd2 = embds[idx2][0]
@ -296,10 +324,11 @@ def build_cosSim_matrix(
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
logger.info("Similarity scores calculated successfully.")
logger.info('Similarity scores calculated successfully.')
return cosineSim_idx_matrix, embds
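The pairwise loop above visits every 2-combination of indices, so the work grows quadratically: C(n, 2) = n(n-1)/2 similarity computations. A quick check of the count that feeds the progress bar (math.comb gives the same value as the factorial expression, without building huge intermediates):

from math import comb, factorial

n = 10_000
assert comb(n, 2) == factorial(n) // factorial(2) // factorial(n - 2)
print(comb(n, 2))  # 49_995_000 pairs for 10k entries

This is the cost the batched candidates_by_index path (see analysis/shared.py below) sidesteps by encoding all entries once and computing the full cosine-similarity matrix in a single vectorized call instead of looping over pairs in Python.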
# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
Series
series with multi index (index pairs) and corresponding similarity score
"""
cosineSim_filt = cast(Series,
cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack())
cosineSim_filt = cast(
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
)
return cosineSim_filt, embds
def list_cosSim_dupl_candidates(
cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]],
@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
list containing relevant index pairs for entries with similarity score greater than
given threshold
"""
logger.info("Start gathering of similarity candidates...")
logger.info('Start gathering of similarity candidates...')
# compare found duplicates
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
# get text content from embedding as second tuple entry
content = [[
idx1,
embds[idx1][1],
idx2,
embds[idx2][1],
score,
]]
content = [
[
idx1,
embds[idx1][1],
idx2,
embds[idx2][1],
score,
]
]
# add candidates to collection DataFrame
df_conc = pd.DataFrame(columns=columns, data=content)
if df_candidates.empty:
@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
# save index pairs
index_pairs.append((idx1, idx2))
logger.info("Similarity candidates gathered successfully.")
logger.info('Similarity candidates gathered successfully.')
if save_candidates:
if saving_path is None:
raise ValueError(("Saving path must be provided if duplicate "
"candidates should be saved."))
raise ValueError(
('Saving path must be provided if duplicate ' 'candidates should be saved.')
)
elif pipeline is not None:
target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_'
+ filename + '.xlsx')
target_filename = (
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
)
elif pipeline is None:
target_filename = f'{filename}.xlsx'
logger.info("Saving similarity candidates...")
logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path)
logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.")
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
return index_pairs, embds
# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``analysis/shared.py``
@ -419,20 +455,28 @@ def similar_ids_groups(
yield list(id_group)
"""
def merge_similarity_dupl(
data: DataFrame,
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
model: SentenceTransformer,
cos_sim_threshold: float,
) -> tuple[DataFrame]:
logger.info("Start merging of similarity candidates...")
logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs)
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group,:]
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
@ -454,10 +498,11 @@ def merge_similarity_dupl(
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info("Similarity candidates merged successfully.")
logger.info('Similarity candidates merged successfully.')
return (merged_data.copy(),)
# merge duplicates
def merge_similarity_dupl_old(
data: DataFrame,
@ -466,11 +511,10 @@ def merge_similarity_dupl_old(
# copy pre-cleaned data
temp = data.copy()
index = temp.index
#logger.info("Start merging of similarity candidates...")
# logger.info("Start merging of similarity candidates...")
# iterate over index pairs
for (i1, i2) in tqdm(dupl_idx_pairs):
for i1, i2 in tqdm(dupl_idx_pairs):
# if an entry does not exist any more, skip this pair
if i1 not in index or i2 not in index:
continue
@ -498,7 +542,7 @@ def merge_similarity_dupl_old(
temp = temp.drop(index=i2)
index = temp.index
#logger.info("Similarity candidates merged successfully.")
# logger.info("Similarity candidates merged successfully.")
return (temp,)
@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
given threshold
"""
# compare found duplicates
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# get texts for comparison
text1 = embds[idx1][1]
text2 = embds[idx2][1]
@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
continue
# get text content from embedding as second tuple entry
content = [[
idx1,
text1,
idx2,
text2,
score,
]]
content = [
[
idx1,
text1,
idx2,
text2,
score,
]
]
df_conc = pd.DataFrame(columns=columns, data=content)
df_candidates = pd.concat([df_candidates, df_conc])

View File

@ -1,11 +1,71 @@
from typing import cast
from collections.abc import Iterable, Iterator
from typing import cast
import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm
from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex
from lang_main.analysis.graphs import update_graph, get_graph_metadata
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset
Parameters
----------
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5
Yields
------
tuple[PandasIndex, PandasIndex]
index pairs which meet the cosine similarity threshold
"""
# embeddings
batch = cast(list[str], data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
),
)
# cosine similarity
cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair
def similar_index_connection_graph(
@ -15,21 +75,21 @@ def similar_index_connection_graph(
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
for (idx1, idx2) in similar_idx_pairs:
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
# for idx1, idx2 in similar_idx_pairs:
# # inplace operation, parent/child do not really exist in undirected graph
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
update_graph(graph=similar_id_graph, batch=similar_idx_pairs)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
return similar_id_graph, graph_info
# TODO check returning tuple
def similar_index_groups(
similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=similar_id_graph))
ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
for id_group in ids_groups:
yield tuple(id_group)
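Together these helpers form a small chain: candidates_by_index yields index pairs whose embeddings reach the cosine-similarity threshold, similar_index_connection_graph links the pairs into an undirected graph, and similar_index_groups yields its connected components, i.e. groups of entries to be treated as one. A hedged usage sketch (the model name is only an example, not the one configured in the project):

from pandas import Series
from sentence_transformers import SentenceTransformer
from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # example model
texts = Series(['Pumpe defekt', 'Pumpe kaputt', 'Filter getauscht'], index=[10, 11, 12])

pairs = candidates_by_index(texts, model=model, cos_sim_threshold=0.8)
similar_graph, _ = similar_index_connection_graph(pairs)
for group in similar_index_groups(similar_graph):
    print(group)  # e.g. (10, 11) if the first two entries clear the threshold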

View File

@ -1,21 +1,17 @@
from typing import cast
from collections.abc import Iterable, Iterator
from typing import cast
import numpy as np
import numpy.typing as npt
from pandas import DataFrame, Series
from torch import Tensor
from sentence_transformers import SentenceTransformer
import sentence_transformers
import sentence_transformers.util
from tqdm.auto import tqdm # TODO: check deletion
from tqdm.auto import tqdm # TODO: check deletion
from lang_main.types import PandasIndex, ObjectID, TimelineCandidates
from lang_main.loggers import logger_timeline as logger
from lang_main.analysis.shared import (
candidates_by_index,
similar_index_connection_graph,
similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
def non_relevant_obj_ids(
@ -25,16 +21,16 @@ def non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:
data = data.copy()
ids_to_ignore: set[ObjectID] = set()
obj_ids = cast(Iterable[ObjectID], # actually NumPy array
data[feature_obj_id].unique())
obj_ids = cast(
Iterable[ObjectID], # actually NumPy array
data[feature_obj_id].unique(),
)
for obj_id in obj_ids:
feats_per_obj_id = cast(
Series,
data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness]
Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
)
# check for uniqueness of given feature for current ObjectID
# ignore NaN values
@ -46,14 +42,15 @@ def non_relevant_obj_ids(
return tuple(ids_to_ignore)
def remove_non_relevant_obj_ids(
data: DataFrame,
thresh_unique_feat_per_id: int,
*,
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> DataFrame:
logger.info("Removing non-relevant ObjectIDs from dataset")
) -> tuple[DataFrame]:
logger.info('Removing non-relevant ObjectIDs from dataset')
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
data=data,
@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}")
logger.info("Non-relevant ObjectIDs removed successfully")
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
logger.info('Non-relevant ObjectIDs removed successfully')
return data
return (data,)
def filter_activities_per_obj_id(
data: DataFrame,
activity_feature: str = 'VorgangsTypName',
relevant_activity_types: Iterable[str] = (
'Reparaturauftrag (Portal)',
),
feature_obj_id: str = 'ObjektID',
threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
logger.info("Filtering activities per ObjectID")
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series,
data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities)
# index of series contains ObjectIDs
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
filt_entries_below_thresh = (data_filter_activities[feature_obj_id]
.isin(obj_ids_below_thresh))
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info("Activities per ObjectID filtered successfully")
return data_filter_activities, num_activities_per_obj_id
def generate_model_input(
data: DataFrame,
@ -107,8 +74,8 @@ def generate_model_input(
'VorgangsArtText',
'VorgangsBeschreibung',
),
) -> DataFrame:
logger.info("Generating concatenation of model input features")
) -> tuple[DataFrame]:
logger.info('Generating concatenation of model input features')
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@ -116,9 +83,40 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info("Model input generated successfully")
logger.info('Model input generated successfully')
return (data,)
def filter_activities_per_obj_id(
data: DataFrame,
activity_feature: str = 'VorgangsTypName',
relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
feature_obj_id: str = 'ObjektID',
threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities count occurrences for each ObjectID
logger.info('Filtering activities per ObjectID')
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
# index of series contains ObjectIDs
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
obj_ids_below_thresh
)
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info('Activities per ObjectID filtered successfully')
return data_filter_activities, num_activities_per_obj_id
return data
# for each obj_id in relevant_obj_ids
## filter data for obj_id
@ -130,6 +128,7 @@ def generate_model_input(
## obtain idx pairs, yield
## use idx pairs to get idx values of series
def get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
@ -140,14 +139,10 @@ def get_timeline_candidates_index(
model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
# already sorted ObjIDs (descending regarding number of activities)
obj_ids = cast(Iterable[ObjectID],
num_activities_per_obj_id.index)
obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)
for obj_id in tqdm(obj_ids):
data_per_obj_id = cast(
DataFrame,
data.loc[data[feature_obj_id]==obj_id]
)
data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
data_model_input = data_per_obj_id[model_input_feature]
candidates_idx = candidates_by_index(
@ -156,7 +151,7 @@ def get_timeline_candidates_index(
cos_sim_threshold=cos_sim_threshold,
)
# directly process candidates
candidates_idx = tuple(candidates_idx)
# candidates_idx = tuple(candidates_idx)
similar_id_graph, _ = similar_index_connection_graph(
similar_idx_pairs=candidates_idx,
)
@ -164,63 +159,8 @@ def get_timeline_candidates_index(
for index_group in similar_index_groups(similar_id_graph):
yield obj_id, index_group
# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset
Parameters
----------
obj_id : ObjectID
_description_
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5
Yields
------
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
ObjectID and tuple of index pairs which meet the cosine
similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair
def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(
return candidates_by_obj_id
def map_obj_texts(
def map_obj_id_to_texts(
data: DataFrame,
obj_ids: Iterable[ObjectID],
feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
obj_id_to_text: dict[ObjectID, str] = {}
for obj_id in obj_ids:
data_per_obj = cast(
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
for obj_id in tqdm(obj_ids):
data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
# just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text
return obj_id_to_text
def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')
return tl_candidates, map_obj_text
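Chained together, these helpers make up the time analysis pipeline behind pipe_timeline: drop ObjectIDs whose descriptive text is inconsistent, build the concatenated NLP model input, keep only objects with enough relevant activities, then group similar entries per object into timeline candidates. A hedged sketch on a tiny made-up DataFrame; the module path, model name, and activity type are assumptions, and real runs feed in the preprocessed dataset instead:

import pandas as pd
from sentence_transformers import SentenceTransformer
from lang_main.analysis.timeline import (  # module path assumed from the imports above
    filter_activities_per_obj_id,
    generate_model_input,
    get_timeline_candidates,
    remove_non_relevant_obj_ids,
)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # example model
data = pd.DataFrame({
    'ObjektID': [1, 1, 2],
    'HObjektText': ['Heizung Halle 3', 'Heizung Halle 3', 'Tor Süd'],
    'VorgangsTypName': ['Störungsmeldung'] * 3,
    'VorgangsBeschreibung': ['Heizung ausgefallen', 'Heizung erneut ausgefallen', 'Tor klemmt'],
})

(data,) = remove_non_relevant_obj_ids(data, thresh_unique_feat_per_id=4)
(data,) = generate_model_input(data, model_input_features=('VorgangsBeschreibung',))
data, num_acts = filter_activities_per_obj_id(
    data, relevant_activity_types=('Störungsmeldung',), threshold_num_activities=1
)
candidates, obj_texts = get_timeline_candidates(
    data, num_acts, model=model, cos_sim_threshold=0.8
)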

View File

@ -1,56 +1,56 @@
from typing import cast
import re
from itertools import combinations
from collections.abc import Iterator
from itertools import combinations
from typing import cast
from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import (
update_graph,
TokenGraph,
update_graph,
)
from lang_main.loggers import logger_token_analysis as logger
# ** Logging
#LOGGING_LEVEL = 'INFO'
#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
#logger = logging.getLogger('ihm_analyse.token_analysis')
# LOGGING_LEVEL = 'INFO'
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
# logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
#POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
#TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
def pre_clean_word(string: str) -> str:
pattern = r'[^A-Za-zäöüÄÖÜ]+'
string = re.sub(pattern, '', string)
return string
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date(
string: str,
fuzzy: bool = False,
) -> bool:
#print(string)
# print(string)
try:
# check if string is a number
# if length is greater than 8, it is not a date
@ -67,10 +67,10 @@ def is_str_date(
except ValueError:
return False
def obtain_relevant_descendants(
token: SpacyToken,
) -> Iterator[SpacyToken]:
for descendant in token.subtree:
# subtrees contain the token itself
# if current element is token skip this element
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
if is_str_date(string=descendant.text):
continue
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant "
f">>{descendant}<<, POS >>{descendant.pos_}<<"))
logger.debug(
(
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
)
# eliminate cases of cross-references with verbs
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')):
if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
):
continue
# skip cases in which descendant is indirect POS with others than verbs
elif descendant.pos_ in POS_INDIRECT:
@ -99,6 +104,7 @@ def obtain_relevant_descendants(
# TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph(
graph: TokenGraph,
doc: SpacyDoc,
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
graph=graph,
parent=token.lemma_,
child=descendant.lemma_,
weight_connection=weight
weight_connection=weight,
)
else:
# if indirect POS, make connection between all associated words
@ -139,12 +145,13 @@ def add_doc_info_to_graph(
weight_connection=weight,
)
def build_token_graph(
data: DataFrame,
model: GermanSpacyModel,
) -> tuple[TokenGraph]:
# empty NetworkX directed graph
#graph = nx.DiGraph()
# graph = nx.DiGraph()
graph = TokenGraph()
for row in tqdm(data.itertuples(), total=len(data)):

View File

@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final
from lang_main import CONFIG
# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export
# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis
# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_acitivities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

View File

@ -1,5 +1,5 @@
from typing import Final
import logging
from typing import Final
from lang_main.types import LoggingLevels

View File

@ -1,20 +1,18 @@
from typing import Any
#from types import FunctionType
import sys
import logging
from collections.abc import Callable
from pathlib import Path
from typing import Any
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle
from lang_main.shared import load_pickle, save_pickle
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class BasePipeline():
class BasePipeline:
def __init__(
self,
name: str,
@ -27,6 +25,8 @@ class BasePipeline():
self.name = name
# working directory for pipeline == output path
self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass
self.actions: list[Callable] = []
@ -39,8 +39,10 @@ class BasePipeline():
self._intermediate_result: Any | None = None
def __repr__(self) -> str:
return (f"{self.__class__.__name__}(name: {self.name}, "
f"working dir: {self.working_dir}, contents: {self.action_names})")
return (
f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
@property
def intermediate_result(self) -> Any:
@ -53,15 +55,16 @@ class BasePipeline():
save_result: bool = False,
) -> None:
# check explicitly for function type
#if isinstance(action, FunctionType):
# if isinstance(action, FunctionType):
if isinstance(action, Callable):
self.actions.append(action)
self.action_names.append(action.__name__)
self.actions_kwargs.append(action_kwargs.copy())
self.is_save_result.append(save_result)
else:
raise TypeError(("Action must be custom function, "
f"but is of type >>{type(action)}<<."))
raise TypeError(
f'Action must be custom function, but is of type >>{type(action)}<<.'
)
# TODO: add multiple entries by utilising simple add method
"""
@ -107,13 +110,14 @@ class BasePipeline():
return data
def prep_run(self) -> None:
logger.info(f"Starting processing pipeline >>{self.name}<<...")
logger.info(f'Starting processing pipeline >>{self.name}<<...')
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(("The pipeline does not contain any "
"performable actions."))
raise NoPerformableActionError(
                'The pipeline does not contain any performable actions.'
)
def run(
self,
@ -135,6 +139,6 @@ class BasePipeline():
# processing tracking
self.curr_proc_idx += 1
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.")
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
return ret
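Putting the pieces above together, registering and running a pipeline looks roughly like this. A minimal sketch with a toy action: the constructor and add() calls mirror the predefined pipelines further down in this commit, while the starting_values keyword and the tuple-shaped step results are inferred from how the saved Pipe-*_Step-*.pkl artefacts are consumed elsewhere in the commit and are therefore assumptions:

from pathlib import Path

from pandas import DataFrame

from lang_main.pipelines.base import BasePipeline


def drop_empty_rows(data: DataFrame) -> tuple[DataFrame]:
    # toy action: receives the previous step's output, returns the next one
    return (data.dropna(how='all'),)


pipe = BasePipeline(name='Demo', working_dir=Path('./results/demo/'))
pipe.add(drop_empty_rows)  # save_result=True would additionally pickle this step's output
ret = pipe.run(starting_values=(DataFrame({'a': [1.0, None]}),))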

View File

@ -1,57 +1,144 @@
from sentence_transformers import SentenceTransformer
import spacy
from sentence_transformers import SentenceTransformer
from lang_main import (
SAVE_PATH_FOLDER,
DATE_COLS,
FILENAME_COSSIM_FILTER_CANDIDATES,
THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import (
load_raw_data,
remove_duplicates,
remove_NA,
analyse_feature,
clean_string_slim,
entry_wise_cleansing,
analyse_feature,
build_cosSim_matrix,
filt_thresh_cosSim_matrix,
list_cosSim_dupl_candidates,
load_raw_data,
merge_similarity_dupl,
remove_duplicates,
remove_NA,
)
from lang_main.analysis.timeline import (
filter_activities_per_obj_id,
generate_model_input,
get_timeline_candidates,
remove_non_relevant_obj_ids,
)
from lang_main.analysis.tokens import build_token_graph
from lang_main.constants import (
ACTIVITY_FEATURE,
ACTIVITY_TYPES,
DATE_COLS,
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
SAVE_PATH_FOLDER,
THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import BasePipeline
# ** pipeline configuration
# ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS})
pipe_target_feat.add(
load_raw_data,
{
'date_cols': DATE_COLS,
},
)
pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim})
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True)
pipe_target_feat.add(
entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
)
pipe_target_feat.add(
analyse_feature,
{
'target_feature': 'VorgangsBeschreibung',
},
save_result=True,
)
# output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs
# ** embedding pipe
# ?? still needed?
# using similarity between entries to catch duplicates with typos or similar content
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True)
pipe_embds.add(
list_cosSim_dupl_candidates,
{'save_candidates': True,
'saving_path': SAVE_PATH_FOLDER,
'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
'pipeline': pipe_embds}, save_result=True)
# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
# pipe_embds.add(
# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
# )
# pipe_embds.add(
# list_cosSim_dupl_candidates,
# {
# 'save_candidates': True,
# 'saving_path': SAVE_PATH_FOLDER,
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
# 'pipeline': pipe_embds,
# },
# save_result=True,
# )
# ** Merge duplicates
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(merge_similarity_dupl, save_result=True)
# pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
merge_similarity_dupl,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
)
# ** token analysis
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True)
pipe_token_analysis.add(
build_token_graph,
{
'model': model_spacy,
},
save_result=True,
)
# ** timeline analysis
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
},
save_result=True,
)
pipe_timeline.add(
generate_model_input,
{
'target_feature_name': 'nlp_model_input',
'model_input_features': MODEL_INPUT_FEATURES,
},
)
pipe_timeline.add(
filter_activities_per_obj_id,
{
'activity_feature': ACTIVITY_FEATURE,
'relevant_activity_types': ACTIVITY_TYPES,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
},
)
pipe_timeline.add(
get_timeline_candidates,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',
},
save_result=True,
)
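The artefact written by the final get_timeline_candidates step is what the dashboard further down in this commit loads; a minimal sketch of unpacking it (the result path is hypothetical, the tuple layout mirrors the cast used in the dashboard):

from typing import cast

from lang_main import load_pickle
from lang_main.types import ObjectID, TimelineCandidates

result_path = './results/demo/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
cands, texts = cast(
    tuple[TimelineCandidates, dict[ObjectID, str]],
    load_pickle(result_path),
)
for obj_id in cands:
    print(obj_id, texts[obj_id], len(cands[obj_id]))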

View File

@ -1,38 +1,47 @@
from typing import Any
import os
import shutil
import pickle
import shutil
import tomllib
from pathlib import Path
from typing import Any
from lang_main.loggers import logger_shared_helpers as logger
# ** Lib
def create_saving_folder(
saving_path_folder: str | Path,
overwrite_existing: bool = False,
) -> None:
# check for existence of given path
if not os.path.exists(saving_path_folder):
os.makedirs(saving_path_folder)
if isinstance(saving_path_folder, str):
saving_path_folder = Path(saving_path_folder)
if not saving_path_folder.exists():
saving_path_folder.mkdir(parents=True)
else:
if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder)
else:
logger.info((f"Path >>{saving_path_folder}<< already exists and remained "
"unchanged. If you want to overwrite this path, use parameter "
">>overwrite_existing<<."))
            logger.info(
                f'Path >>{saving_path_folder}<< already exists and remained '
                'unchanged. If you want to overwrite this path, use parameter '
                '>>overwrite_existing<<.'
            )
def load_toml_config(
path_to_toml: str | Path,
) -> dict[str, Any]:
with open(path_to_toml, "rb") as f:
with open(path_to_toml, 'rb') as f:
data = tomllib.load(f)
logger.info("Loaded TOML config file successfully.")
logger.info('Loaded TOML config file successfully.')
return data
# saving and loading using pickle
# careful: pickling from unknown sources can be dangerous
def save_pickle(
@ -41,16 +50,18 @@ def save_pickle(
) -> None:
with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f"Saved file successfully under {path}")
logger.info(f'Saved file successfully under {path}')
def load_pickle(
path: str | Path,
) -> Any:
with open(path, 'rb') as file:
obj = pickle.load(file)
logger.info("Loaded file successfully.")
logger.info('Loaded file successfully.')
return obj
# TODO: remove, too specialised for common application
"""
def filter_candidates_idx(
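A minimal usage sketch for create_saving_folder as defined above; the folder name is arbitrary and the module path lang_main.shared is inferred from the imports earlier in this commit:

from pathlib import Path

from lang_main.shared import create_saving_folder

results_dir = Path('./results/demo/')
create_saving_folder(results_dir)                            # created if missing, otherwise logged and left unchanged
create_saving_folder(results_dir, overwrite_existing=True)   # existing folder is deleted and re-created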

View File

@ -1,4 +1,4 @@
from typing import TypeAlias, Literal
from typing import Literal, TypeAlias
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc

View File

@ -13,29 +13,25 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
"ename": "ModuleNotFoundError",
"evalue": "No module named 'ihm_analyse'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
]
}
],
"source": [
"from ihm_analyse import CONFIG\n",
"from ihm_analyse.lib.preprocess import (\n",
"from lang_main import CONFIG\n",
"from lang_main.lib.preprocess import (\n",
" load_raw_data,\n",
" remove_duplicates,\n",
" remove_NA,\n",
@ -47,8 +43,8 @@
" list_cosSim_dupl_candidates,\n",
" merge_similarity_dupl,\n",
")\n",
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n",
"from ihm_analyse.lib.helpers import (\n",
"from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
"from lang_main.lib.helpers import (\n",
" save_pickle, \n",
" load_pickle, \n",
" create_saving_folder,\n",

View File

@ -1,28 +1,42 @@
from typing import cast
from pathlib import Path
import pandas as pd
import plotly.express as px
from dash import (
Dash,
html,
dcc,
callback,
Output,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
import plotly.express as px
import pandas as pd
from lang_main import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
data = cast(DataFrame, load_pickle('./data.pkl'))
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_tl = Path(
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
@ -52,25 +66,28 @@ hover_data = {
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
html.Div(children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder="ObjektID auswählen...",
)
]),
html.Div(children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
@ -90,9 +108,10 @@ def update_obj_text(obj_id):
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id)+1))
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
@ -106,7 +125,7 @@ def update_timeline(index, obj_id):
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
# figure
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
title=title,
hover_data=hover_data,
)
fig.update_traces(
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat="%B\n%Y",
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified")
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'),
Output('table-candidates', 'columns')],
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
@ -141,13 +156,13 @@ def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = (df
.filter(items=table_feats, axis=1)
.sort_values(by='ErstellungsDatum', ascending=True))
cols = [{"name": i, "id": i} for i in df.columns]
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
table_data = df.to_dict('records')
return table_data, cols
if __name__ == '__main__':
app.run(debug=True)
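Running the module directly starts Dash's development server; debug=True enables live reload, and unless overridden the app serves on Dash's defaults, i.e. equivalent to:

app.run(debug=True, host='127.0.0.1', port=8050)  # Dash defaults spelled out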

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

View File

@ -0,0 +1,663 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from lang_main import load_pickle"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
"metadata": {},
"outputs": [],
"source": [
"def get_files(path: str) -> tuple[Path, ...]:\n",
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
" assert p.exists(), \"path does not exist\"\n",
" return tuple(p.glob(r'*'))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2559</th>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2558</th>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2557</th>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2556</th>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6782</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry ... num_assoc_obj_ids\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
"... ... ... ...\n",
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
"\n",
"[4545 rows x 5 columns]"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>162</td>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>33</td>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>131</td>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160</td>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>140</td>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>2559</td>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6757</th>\n",
" <td>2558</td>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6758</th>\n",
" <td>2557</td>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6759</th>\n",
" <td>2556</td>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6760</th>\n",
" <td>6782</td>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" index ... num_assoc_obj_ids\n",
"0 162 ... 206\n",
"1 33 ... 74\n",
"2 131 ... 4\n",
"3 160 ... 11\n",
"4 140 ... 166\n",
"... ... ... ...\n",
"6756 2559 ... 1\n",
"6757 2558 ... 1\n",
"6758 2557 ... 1\n",
"6759 2556 ... 1\n",
"6760 6782 ... 2\n",
"\n",
"[4545 rows x 6 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 72,
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(i for i in range(100000000))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num = 0\n",
"for i in tqdm(tup):\n",
" num += i"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4999999950000000"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36366147-3632-4518-936e-878563305e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
"metadata": {},
"outputs": [],
"source": [
"arr = np.random.rand(1000,1000)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
"metadata": {},
"outputs": [],
"source": [
"idx = np.argwhere(arr >= 0.97)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30220"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
"metadata": {},
"outputs": [],
"source": [
"batch = idx[:200]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(map(tuple, batch))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(tuple(x) for x in batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}