STRF for similarity duplicates, time analysis pipeline, enhanced config

Florian Förster 2024-05-29 16:34:31 +02:00
parent 5d2c97165a
commit bb987e2108
30 changed files with 1875 additions and 693 deletions

View File

@@ -34,3 +34,15 @@ trials = [
    "plotly>=5.22.0",
    "dash>=2.17.0",
]

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]

View File

@@ -1,33 +1,43 @@
import typing
import warnings
from pathlib import Path
from typing import cast

from lang_main import (
    TokenGraph,
    create_saving_folder,
    load_pickle,
)
from lang_main.constants import (
    DO_GRAPH_POSTPROCESSING,
    DO_PREPROCESSING,
    DO_TIME_ANALYSIS,
    DO_TOKEN_ANALYSIS,
    INPUT_PATH_FOLDER,
    PATH_TO_DATASET,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
    THRESHOLD_AMOUNT_CHARACTERS,
    THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.pipelines.predefined import (
    pipe_merge,
    pipe_target_feat,
    pipe_timeline,
    pipe_token_analysis,
)
from lang_main.types import (
    ObjectID,
    TimelineCandidates,
)
from pandas import DataFrame, Series

# ** processing pipeline


def run_preprocessing() -> DataFrame:
@@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame:
        overwrite_existing=True,
    )
    # run pipelines
    ret = typing.cast(
        tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
    )
    target_feat_data = ret[0]
    # only entries with more than threshold amount of characters
    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
    # dupl_idx_pairs, embds = typing.cast(
    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
    #     pipe_embds.run(starting_values=(subset_data,)),
    # )
    # merge duplicates, results saved separately
    subset_data = target_feat_data.loc[data_filter].copy()
    ret = typing.cast(
        tuple[DataFrame],
        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
        pipe_merge.run(starting_values=(subset_data,)),
    )
    preprocessed_data = ret[0]

    return preprocessed_data


def run_token_analysis(
    preprocessed_data: DataFrame,
) -> TokenGraph:
    # build token graph
    (tk_graph,) = typing.cast(
        tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,))
    )
    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')

    return tk_graph


def run_graph_postprocessing(
    tk_graph: TokenGraph,
) -> TokenGraph:
    # filter graph by edge weight and remove single nodes (no connection)
    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
    tk_graph_filtered.save_graph(
        SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
    )
    tk_graph_filtered.to_pickle(
        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
    )

    return tk_graph_filtered


def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    filename = 'without_nan'
    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
    verify_path(loading_path)
    ret = load_pickle(loading_path)
    preprocessed_data = ret[0]
    ret = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )

    return ret


def verify_path(
    loading_path: Path,
) -> None:
    if not loading_path.exists():
        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')


def main() -> None:
    pre_step_skipped: bool = False
    # ** preprocess
    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
        preprocessed_data = run_preprocessing()
    elif not SKIP_PREPROCESSING:
        # !! hardcoded result filenames
        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
        loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
        verify_path(loading_path)
        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
        preprocessed_data = ret[0]
    else:
        pre_step_skipped = True
        warnings.warn('No preprocessing action selected. Skipped.')
        # sys.exit(0)

    # ** token analysis
    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
        if pre_step_skipped:
            raise RuntimeError(
                'Preprocessing step skipped. Token analysis cannot be performed.'
            )
        preprocessed_data_trunc = typing.cast(
            DataFrame, preprocessed_data[['entry', 'num_occur']].copy()
        )  # type: ignore
        tk_graph = run_token_analysis(preprocessed_data_trunc)
    elif not SKIP_TOKEN_ANALYSIS:
        # !! hardcoded result filenames
        # whole graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        pre_step_skipped = True
        warnings.warn('No token analysis action selected. Skipped.')

    # ** graph postprocessing
    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
        if pre_step_skipped:
            raise RuntimeError(
                (
                    'Preprocessing or token analysis step skipped. '
                    'Graph postprocessing cannot be performed.'
                )
            )
        tk_graph_filtered = run_graph_postprocessing(tk_graph)
    elif not SKIP_GRAPH_POSTPROCESSING:
        # !! hardcoded result filenames
        # filtered graph
        filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
        verify_path(loading_path)
        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
        pre_step_skipped = False
    else:
        warnings.warn('No graph postprocessing action selected. Skipped.')

    # ** time analysis
    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
        # no check for fails, runs separately
        ret = run_time_analysis()
    elif not SKIP_TIME_ANALYSIS:
        ...
    else:
        warnings.warn('No time analysis action selected. Skipped.')


if __name__ == '__main__':
    main()
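Every stage in main() above is gated by a DO_*/SKIP_* flag pair taken from the config-driven constants. A minimal sketch of that gating pattern, with hypothetical run_step/load_cached callables standing in for the real pipeline and loading calls:

import warnings
from typing import Any, Callable


def run_or_load(
    do_flag: bool,
    skip_flag: bool,
    run_step: Callable[[], Any],
    load_cached: Callable[[], Any],
) -> Any | None:
    # mirrors the gating in main(): compute, reuse a saved result, or skip entirely
    if do_flag and not skip_flag:
        return run_step()
    elif not skip_flag:
        return load_cached()
    warnings.warn('No action selected for this step. Skipped.')
    return None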

Binary file not shown.

View File

@@ -1,17 +1,21 @@
# lang_main: Config file

[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true

#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

View File

@@ -0,0 +1,59 @@
# lang_main: Config file
[paths]
inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/'
results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = true
token_analysis = false
token_analysis_skip = true
graph_postprocessing = false
graph_postprocessing_skip = true
time_analysis = true
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
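Judging by the reworked package __init__ later in this commit, this file is expected as lang_main_config.toml next to the calling script and is parsed into a plain dictionary. A hedged sketch of reading it with Python's standard tomllib (the key names are the ones shown above, the variable names are illustrative):

import tomllib
from pathlib import Path

# illustrative only: keys follow the config above, names mirror the constants used
# by the scripts
config_path = Path('lang_main_config.toml')
with config_path.open('rb') as file:
    config = tomllib.load(file)

DO_TIME_ANALYSIS = config['control']['time_analysis']
SKIP_TIME_ANALYSIS = config['control']['time_analysis_skip']
THRESHOLD_SIMILARITY = config['time_analysis']['model_input']['threshold_similarity']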

scripts/test.py (new file, +12 lines)
View File

@@ -0,0 +1,12 @@
from lang_main.analysis.preprocessing import clean_string_slim
from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.
"""
print(txt)
print(clean_string_slim(txt))

View File

@@ -1,18 +1,19 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
    create_saving_folder,
    load_pickle,
    load_toml_config,
    save_pickle,
)

__all__ = [
    'save_pickle',
@@ -32,37 +33,30 @@ logging.basicConfig(
    datefmt=LOG_DATE_FMT,
)

CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME

# load config data: internal/external
if USE_INTERNAL_CONFIG:
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
    caller_file = Path(inspect.stack()[-1].filename)
    if not caller_file.exists():
        raise FileNotFoundError('Caller file could not be correctly retrieved.')
    cfg_path_external = caller_file.parent / CONFIG_FILENAME
    if not cfg_path_external.exists():
        shutil.copy(cfg_path_internal, cfg_path_external)
        sys.exit(
            (
                'No config file was found. A new one with default values was created '
                'in the execution path. Please fill in the necessary values and '
                'restart the programm.'
            )
        )
    # raise NotImplementedError("External config data not implemented yet.")
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
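The constant definitions removed from this module keep the same CONFIG-lookup pattern and, judging by the imports in the main script, now live in lang_main.constants. A shortened sketch reconstructed from the removed lines:

from pathlib import Path
from typing import Final

# pattern of the derived constants (CONFIG is the dictionary loaded above)
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']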

View File

@@ -1,18 +1,18 @@
import copy
import sys
import typing
from collections.abc import Hashable, Iterable
from pathlib import Path
from typing import Any, Final, Literal, Self, overload

import networkx as nx
import numpy as np
import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame

from lang_main.loggers import logger_graphs as logger
from lang_main.shared import load_pickle, save_pickle

# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@@ -31,8 +31,7 @@ def get_graph_metadata(
    min_edge_weight: int = 1_000_000
    max_edge_weight: int = 0
    for edge in graph.edges:
        weight = typing.cast(int, graph[edge[0]][edge[1]]['weight'])
        if weight < min_edge_weight:
            min_edge_weight = weight
        if weight > max_edge_weight:
@@ -54,18 +53,20 @@ def get_graph_metadata(
    )
    if logging:
        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')

    return graph_info


def update_graph(
    graph: Graph | DiGraph,
    *,
    batch: Iterable[tuple[Hashable, Hashable]] | None = None,
    parent: Hashable | None = None,
    child: Hashable | None = None,
    weight_connection: int = 1,
) -> None:
    # !! not necessary to check for existence of nodes
@@ -78,7 +79,9 @@ def update_graph(
        graph.add_node(child)
    """
    # check if edge not in Graph
    if batch is not None:
        graph.add_edges_from(batch, weight=weight_connection)
    elif not graph.has_edge(parent, child):
        # create new edge, nodes will be created if not already present
        graph.add_edge(parent, child, weight=weight_connection)
    else:
@@ -87,16 +90,15 @@ def update_graph(
        weight += weight_connection
        graph[parent][child]['weight'] = weight
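update_graph() now accepts either a single parent/child pair or a whole batch of index pairs. A small usage sketch on a plain networkx graph (the tokens are made up):

import networkx as nx

from lang_main.analysis.graphs import update_graph

# usage sketch, not part of the diff: build the same graph with a per-pair call
# or with the new batch keyword
g = nx.Graph()
update_graph(graph=g, parent='pumpe', child='motor')
update_graph(graph=g, batch=[('motor', 'defekt'), ('pumpe', 'defekt')], weight_connection=2)
print(g.edges(data=True))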
# build undirected adjacency matrix
def convert_graph_to_undirected(
    graph: DiGraph,
    logging: bool = LOGGING_DEFAULT,
) -> Graph:
    # get adjacency matrix
    adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
    arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
    # build undirected array: adding edges of lower triangular matrix to upper one
    arr_upper = np.triu(arr)
    arr_lower = np.tril(arr)
@@ -104,18 +106,17 @@ def convert_graph_to_undirected(
    arr_new = arr_upper + arr_lower
    # assign new data and create graph
    adj_mat.loc[:] = arr_new  # type: ignore
    graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
    # info about graph
    if logging:
        logger.info('Successfully converted graph to one with undirected edges.')
        _ = get_graph_metadata(graph=graph_undir, logging=logging)

    return graph_undir


class TokenGraph(DiGraph):
    def __init__(
        self,
        name: str = 'TokenGraph',
@@ -138,9 +139,11 @@ class TokenGraph(DiGraph):
        return self.__str__()

    def __str__(self) -> str:
        return (
            f'TokenGraph(name: {self.name}, number of nodes: '
            f'{len(self.nodes)}, number of edges: '
            f'{len(self.edges)})'
        )

    # !! only used to verify that saving was done correctly
    """
@@ -186,24 +189,19 @@ class TokenGraph(DiGraph):
        self,
        inplace: Literal[True] = ...,
        logging: bool | None = ...,
    ) -> None: ...

    @overload
    def to_undirected(
        self,
        inplace: Literal[False],
        logging: bool | None = ...,
    ) -> Graph: ...

    @overload
    def to_undirected(
        self, inplace: bool = ..., logging: bool | None = ...
    ) -> Graph | None: ...

    def to_undirected(
        self,
@@ -213,10 +211,10 @@ class TokenGraph(DiGraph):
        if logging is None:
            logging = self.logging

        self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
        self._metadata_undirected = get_graph_metadata(
            graph=self._undirected, logging=logging
        )

        if not inplace:
            return self._undirected
@@ -227,11 +225,11 @@ class TokenGraph(DiGraph):
        if logging is None:
            logging = self.logging

        self._metadata_directed = get_graph_metadata(graph=self, logging=logging)

        if self._undirected is not None:
            self._metadata_undirected = get_graph_metadata(
                graph=self._undirected, logging=logging
            )

    def filter_by_edge_weight(
        self,
@@ -254,8 +252,7 @@ class TokenGraph(DiGraph):
        filtered_graph = self.copy()
        for edge in original_graph_edges:
            weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
            if weight < threshold:
                filtered_graph.remove_edge(edge[0], edge[1])
@@ -304,9 +301,9 @@ class TokenGraph(DiGraph):
        filename: str | None = None,
    ) -> Path:
        if filename is not None:
            saving_path = path.joinpath(f'{filename}')
        else:
            saving_path = path.joinpath(f'{self.name}')

        return saving_path
@@ -341,12 +338,11 @@ class TokenGraph(DiGraph):
        elif not directed and self._undirected is not None:
            target_graph = self._undirected
        else:
            raise ValueError('No undirected graph available.')

        saving_path = saving_path.with_suffix('.graphml')
        nx.write_graphml(G=target_graph, path=saving_path)
        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))

    def to_pickle(
        self,
@@ -378,12 +374,12 @@ class TokenGraph(DiGraph):
        match path.suffix:
            case '.graphml':
                graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
                logger.info(f'Successfully loaded graph from GraphML file {path}.')
            case '.pkl' | '.pickle':
                graph = typing.cast(Self, load_pickle(path))
                logger.info(f'Successfully loaded graph from pickle file {path}.')
            case _:
                raise ValueError('File format not supported.')

        return graph
@@ -396,7 +392,7 @@ class TokenGraph(DiGraph):
        path = Path(path)
        if path.suffix not in ('.pkl', '.pickle'):
            raise ValueError('File format not supported.')

        graph = typing.cast(Self, load_pickle(path))
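A hedged round-trip sketch using the pickle helpers shown above; token pairs and the target folder are placeholders:

from pathlib import Path

from lang_main.analysis.graphs import TokenGraph, update_graph

# usage sketch only: a tiny graph, saved and reloaded via the pickle helpers
tk_graph = TokenGraph(name='DemoGraph')
update_graph(graph=tk_graph, batch=[('pumpe', 'defekt'), ('motor', 'defekt')])
tk_graph.to_pickle(Path('./results'), filename='DemoGraph')
reloaded = TokenGraph.from_pickle(Path('./results/DemoGraph.pkl'))
print(reloaded)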

View File

@@ -1,29 +1,29 @@
import re
from collections.abc import Iterable
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import Callable, cast

import numpy as np
import pandas as pd
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex

# ** (1) dataset preparation: loading and simple preprocessing
@@ -67,11 +67,16 @@ def load_raw_data(
        parse_dates=date_cols,
        dayfirst=True,
    )
    logger.info('Loaded dataset successfully.')
    logger.info(
        (
            f'Dataset properties: number of entries: {len(data)}, '
            f'number of features {len(data.columns)}'
        )
    )

    return (data,)


def remove_duplicates(
    data: DataFrame,
) -> tuple[DataFrame]:
@@ -89,7 +94,7 @@ def remove_duplicates(
    """
    # obtain info about duplicates over all features
    duplicates_filt = data.duplicated()
    logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
    # drop duplicates
    wo_duplicates = data.drop_duplicates(ignore_index=True)
    duplicates_subset: list[str] = [
@@ -97,16 +102,26 @@ def remove_duplicates(
        'ObjektID',
    ]
    duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
    logger.info(
        (
            'Number of duplicates over subset '
            f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
        )
    )
    wo_duplicates = wo_duplicates.drop_duplicates(
        subset=duplicates_subset, ignore_index=True
    ).copy()
    logger.info('Removed all duplicates from dataset successfully.')
    logger.info(
        (
            f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
            f'number of features {len(wo_duplicates.columns)}'
        )
    )

    return (wo_duplicates,)


def remove_NA(
    data: DataFrame,
    target_features: list[str] = [
@@ -128,15 +143,16 @@ def remove_NA(
        dataset with removed NA entries for given subset of features
    """
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
    )

    return (wo_NA,)


# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features
@@ -151,13 +167,16 @@ def clean_string_slim(
        cleaned entry
    """
    # remove special chars
    pattern = r'[\t\n\r\f\v]+'
    string = re.sub(pattern, ' ', string)
    pattern = r'([,;.:!?\-_+]){2,}'
    string = re.sub(pattern, r'\1', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


def entry_wise_cleansing(
    data: DataFrame,
    target_feature: str,
@@ -165,10 +184,16 @@ def entry_wise_cleansing(
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    data[target_feature] = data[target_feature].map(cleansing_func)
    logger.info(
        (
            f'Successfully applied entry-wise cleansing procedure '
            f'>>{cleansing_func.__name__}<< '
            f'for feature >>{target_feature}<<'
        )
    )

    return (data,)


# ** in-depth analysis of one feature
# following functions try to gain insights on a given feature of the IHM dataset such
# as number of occurrences or associated Object IDs
@@ -178,7 +203,7 @@ def analyse_feature(
) -> tuple[DataFrame]:
    # feature columns
    feature_entries = data[target_feature]
    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')

    # obtain unique entries
    unique_feature_entries = feature_entries.unique()
@@ -186,7 +211,7 @@ def analyse_feature(
    cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
    result_df = pd.DataFrame(columns=cols)

    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
@@ -195,13 +220,10 @@ def analyse_feature(
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()

        conc_df = pd.DataFrame(
            data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]],
            columns=cols,
        )

        result_df = pd.concat([result_df, conc_df], ignore_index=True)
@@ -230,9 +252,9 @@ def build_embedding_map(
        is_STRF = True

    if not any((is_spacy, is_STRF)):
        raise NotImplementedError('Model type unknown')

    for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
        # verbose code: Pyright not inferring types correctly
        idx = cast(int, idx)
        text = cast(str, text)
@@ -246,12 +268,17 @@ def build_embedding_map(
            logger.debug(f'{embd.text=} has no vector')
        elif is_STRF:
            model = cast(SentenceTransformer, model)
            embd = cast(Tensor, model.encode(text, show_progress_bar=False))

        embeddings[idx] = (embd, text)

    return embeddings, (is_spacy, is_STRF)


# adapt interface
# use candidates by index function
# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix
# build similarity matrix out of embeddings
def build_cosSim_matrix(
    data: Series,
@@ -259,10 +286,11 @@ def build_cosSim_matrix(
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
    # build empty matrix
    df_index = data.index
    cosineSim_idx_matrix = pd.DataFrame(
        data=0.0, columns=df_index, index=df_index, dtype=np.float32
    )

    logger.info('Start building embedding map...')
    # obtain embeddings based on used model
    embds, (is_spacy, is_STRF) = build_embedding_map(
@@ -270,15 +298,15 @@ def build_cosSim_matrix(
        model=model,
    )
    logger.info('Embedding map built successfully.')

    # apply index based mapping for efficient handling of large texts
    combs = combinations(df_index, 2)
    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)

    logger.info('Start calculation of similarity scores...')

    for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
        # print(f"{idx1=}, {idx2=}")
        embd1 = embds[idx1][0]
        embd2 = embds[idx2][0]
@@ -296,10 +324,11 @@ def build_cosSim_matrix(
            cosineSim_idx_matrix.at[idx1, idx2] = cosSim

    logger.info('Similarity scores calculated successfully.')

    return cosineSim_idx_matrix, embds


# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
@@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix(
    Series
        series with multi index (index pairs) and corresponding similarity score
    """
    cosineSim_filt = cast(
        Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
    )

    return cosineSim_filt, embds


def list_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
@@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates(
        list containing relevant index pairs for entries with similarity score greater than
        given threshold
    """
    logger.info('Start gathering of similarity candidates...')
    # compare found duplicates
    columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)
    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                embds[idx1][1],
                idx2,
                embds[idx2][1],
                score,
            ]
        ]
        # add candidates to collection DataFrame
        df_conc = pd.DataFrame(columns=columns, data=content)
        if df_candidates.empty:
@@ -371,24 +404,27 @@ def list_cosSim_dupl_candidates(
        # save index pairs
        index_pairs.append((idx1, idx2))

    logger.info('Similarity candidates gathered successfully.')

    if save_candidates:
        if saving_path is None:
            raise ValueError(
                ('Saving path must be provided if duplicate ' 'candidates should be saved.')
            )
        elif pipeline is not None:
            target_filename = (
                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
            )
        elif pipeline is None:
            target_filename = f'{filename}.xlsx'

        logger.info('Saving similarity candidates...')
        target_path = saving_path.joinpath(target_filename)
        df_candidates.to_excel(target_path)
        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')

    return index_pairs, embds


# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py``
@@ -419,16 +455,24 @@ def similar_ids_groups(
        yield list(id_group)
"""


def merge_similarity_dupl(
    data: DataFrame,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> tuple[DataFrame]:
    logger.info('Start merging of similarity candidates...')
    # data
    merged_data = data.copy()
    model_input = merged_data['entry']
    candidates_idx = candidates_by_index(
        data_model_input=model_input,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
    )
    # graph of similar ids
    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)

    for similar_id_group in similar_index_groups(similar_id_graph):
        similar_id_group = list(similar_id_group)
@@ -454,10 +498,11 @@ def merge_similarity_dupl(
        merged_data.update(merged_similar_data)
        merged_data = merged_data.drop(index=similar_id_group)

    logger.info('Similarity candidates merged successfully.')

    return (merged_data.copy(),)


# merge duplicates
def merge_similarity_dupl_old(
    data: DataFrame,
@@ -469,8 +514,7 @@ def merge_similarity_dupl_old(
    # logger.info("Start merging of similarity candidates...")

    # iterate over index pairs
    for i1, i2 in tqdm(dupl_idx_pairs):
        # if an entry does not exist any more, skip this pair
        if i1 not in index or i2 not in index:
            continue
@@ -521,14 +565,13 @@ def choose_cosSim_dupl_candidates(
        given threshold
    """
    # compare found duplicates
    columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)
    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in cosineSim_filt.items():  # type: ignore
        # get texts for comparison
        text1 = embds[idx1][1]
        text2 = embds[idx2][1]
@@ -542,13 +585,15 @@ def choose_cosSim_dupl_candidates(
            continue

        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                text1,
                idx2,
                text2,
                score,
            ]
        ]
        df_conc = pd.DataFrame(columns=columns, data=content)
        df_candidates = pd.concat([df_candidates, df_conc])
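merge_similarity_dupl() now takes a SentenceTransformer model and a cosine-similarity threshold instead of precomputed index pairs. A usage sketch; the model name and pickle path are assumptions, and the data is expected to carry the 'entry' column produced by the preprocessing pipeline:

from pathlib import Path

from sentence_transformers import SentenceTransformer

from lang_main.analysis.preprocessing import merge_similarity_dupl
from lang_main.shared import load_pickle

# hedged usage sketch for the new SentenceTransformer-based signature
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
preprocessed_data = load_pickle(Path('./results/preprocessed.pkl'))[0]
(merged_data,) = merge_similarity_dupl(
    data=preprocessed_data,
    model=model,
    cos_sim_threshold=0.8,
)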

View File

@@ -1,11 +1,71 @@
from collections.abc import Iterable, Iterator
from typing import cast

import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm.auto import tqdm

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.types import PandasIndex


def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
    # ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """function to filter candidate indices based on cosine similarity
    using a SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        index pairs which meet the cosine similarity threshold
    """
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair
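A usage sketch for candidates_by_index() together with the graph helpers below; the Series index is preserved in the yielded pairs, so they can be looked up in the original DataFrame, and the model name is an assumption:

import pandas as pd
from sentence_transformers import SentenceTransformer

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

# hedged sketch: toy texts and model name are placeholders
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
texts = pd.Series(
    ['Pumpe defekt', 'Pumpe ist defekt', 'Fenster klemmt'],
    index=[10, 17, 42],
)
pairs = list(candidates_by_index(texts, model=model, cos_sim_threshold=0.8))
graph, _ = similar_index_connection_graph(pairs)
groups = list(similar_index_groups(graph))
print(pairs, groups)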
def similar_index_connection_graph(
@@ -15,21 +75,21 @@ def similar_index_connection_graph(
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info


# TODO check returning tuple
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))

    for id_group in ids_groups:
        yield tuple(id_group)

View File

@@ -1,21 +1,17 @@
from collections.abc import Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates


def non_relevant_obj_ids(
@@ -25,16 +21,16 @@ def non_relevant_obj_ids(
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[ObjectID, ...]:
    data = data.copy()
    ids_to_ignore: set[ObjectID] = set()
    obj_ids = cast(
        Iterable[ObjectID],  # actually NumPy array
        data[feature_obj_id].unique(),
    )
    for obj_id in obj_ids:
        feats_per_obj_id = cast(
            Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness]
        )
        # check for uniqueness of given feature for current ObjectID
        # ignore NaN values
@@ -46,14 +42,15 @@ def non_relevant_obj_ids(
    return tuple(ids_to_ignore)


def remove_non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset')
    data = data.copy()
    ids_to_ignore = non_relevant_obj_ids(
        data=data,
@@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids(
    )
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
    logger.info('Non-relevant ObjectIDs removed successfully')

    return (data,)
def generate_model_input(
    data: DataFrame,
@@ -107,8 +74,8 @@ def generate_model_input(
        'VorgangsArtText',
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
@@ -116,9 +83,40 @@ def generate_model_input(
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully')

    return (data,)


def filter_activities_per_obj_id(
    data: DataFrame,
    activity_feature: str = 'VorgangsTypName',
    relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),
    feature_obj_id: str = 'ObjektID',
    threshold_num_activities: int = 1,
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
        Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
    )
    # filter for ObjectIDs with more than given number of activities
    filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities
    # index of series contains ObjectIDs
    obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
    filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
        obj_ids_below_thresh
    )
    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully')

    return data_filter_activities, num_activities_per_obj_id
# for each obj_id in relevant_obj_ids
## filter data for obj_id
@@ -130,6 +128,7 @@
## obtain idx pairs, yield
## use idx pairs to get idx values of series


def get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
@@ -140,14 +139,10 @@ def get_timeline_candidates_index(
    model_input_feature: str = 'nlp_model_input',
) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]:
    # already sorted ObjIDs (descending regarding number of activities)
    obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index)

    for obj_id in tqdm(obj_ids):
        data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
        data_model_input = data_per_obj_id[model_input_feature]

        candidates_idx = candidates_by_index(
@@ -156,7 +151,7 @@ def get_timeline_candidates_index(
            cos_sim_threshold=cos_sim_threshold,
        )
        # directly process candidates
        # candidates_idx = tuple(candidates_idx)
        similar_id_graph, _ = similar_index_connection_graph(
            similar_idx_pairs=candidates_idx,
        )
@@ -164,63 +159,8 @@ def get_timeline_candidates_index(
        for index_group in similar_index_groups(similar_id_graph):
            yield obj_id, index_group


# TODO: check application for duplicate removal
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset
Parameters
----------
obj_id : ObjectID
_description_
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5
Yields
------
Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]]
ObjectID and tuple of index pairs which meet the cosine
similarity threshold
"""
# embeddings
batch = cast(list[str],
data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
)
)
# cosine similarity
cos_sim = cast(
npt.NDArray,
sentence_transformers.util.cos_sim(embds, embds).numpy()
)
np.fill_diagonal(cos_sim, 0.)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64],
tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair
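candidates_by_index is removed here, but get_timeline_candidates_index keeps calling the same cosine-similarity pairing (the helper is presumably imported from a shared analysis module after this refactor). A minimal sketch of the technique, assuming a locally available sentence-transformers model; the Series index stands in for the pandas indices of the original dataset, and the 0.8 threshold mirrors threshold_similarity from the config:

import numpy as np
import pandas as pd
import sentence_transformers
from sentence_transformers import SentenceTransformer

# hypothetical mini dataset; keys play the role of the original pandas indices
texts = pd.Series(
    {
        10: 'Pumpe undicht, Dichtung tauschen',
        11: 'Pumpe ist undicht, Dichtung muss getauscht werden',
        12: 'Wöchentliche Sichtkontrolle / Reinigung',
    }
)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embds = model.encode(texts.to_list(), convert_to_tensor=True, show_progress_bar=False)
cos_sim = sentence_transformers.util.cos_sim(embds, embds).cpu().numpy()
np.fill_diagonal(cos_sim, 0.0)   # ignore self-similarity
cos_sim = np.triu(cos_sim)       # keep every pair only once
pairs = [
    tuple(texts.index[i] for i in idx)   # map matrix positions back to pandas indices
    for idx in np.argwhere(cos_sim >= 0.8)
]
print(pairs)   # the two pump entries should pair up if they clear the threshold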
def transform_timeline_candidates( def transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]], candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates: ) -> TimelineCandidates:
@ -259,20 +199,52 @@ def transform_timeline_candidates(
return candidates_by_obj_id return candidates_by_obj_id
def map_obj_texts(
def map_obj_id_to_texts(
data: DataFrame, data: DataFrame,
obj_ids: Iterable[ObjectID], feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]: ) -> dict[ObjectID, str]:
data = data.copy()
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
obj_id_to_text: dict[ObjectID, str] = {} obj_id_to_text: dict[ObjectID, str] = {}
for obj_id in obj_ids: for obj_id in tqdm(obj_ids):
data_per_obj = cast( data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
DataFrame,
data.loc[data['ObjektID']==obj_id]
)
# just take first entry # just take first entry
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0]) obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
obj_text = obj_text.strip(r' ,.:') obj_text = obj_text.strip(r' ,.:')
obj_id_to_text[obj_id] = obj_text obj_id_to_text[obj_id] = obj_text
return obj_id_to_text return obj_id_to_text
def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = 'ObjektID',
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')
return tl_candidates, map_obj_text

View File

@ -1,21 +1,20 @@
from typing import cast
import re import re
from itertools import combinations
from collections.abc import Iterator from collections.abc import Iterator
from itertools import combinations
from typing import cast
from dateutil.parser import parse from dateutil.parser import parse
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.lang.de import German as GermanSpacyModel
from pandas import DataFrame from pandas import DataFrame
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from spacy.tokens.token import Token as SpacyToken
from tqdm.auto import tqdm from tqdm.auto import tqdm
from lang_main.loggers import logger_token_analysis as logger
from lang_main.analysis.graphs import ( from lang_main.analysis.graphs import (
update_graph,
TokenGraph, TokenGraph,
update_graph,
) )
from lang_main.loggers import logger_token_analysis as logger
# ** Logging # ** Logging
# LOGGING_LEVEL = 'INFO' # LOGGING_LEVEL = 'INFO'
@ -38,13 +37,14 @@ TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connection in texts # ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
def pre_clean_word(string: str) -> str:
pattern = r'[^A-Za-zäöüÄÖÜ]+' pattern = r'[^A-Za-zäöüÄÖÜ]+'
string = re.sub(pattern, '', string) string = re.sub(pattern, '', string)
return string return string
# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format # https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
def is_str_date( def is_str_date(
string: str, string: str,
@ -67,10 +67,10 @@ def is_str_date(
except ValueError: except ValueError:
return False return False
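is_str_date follows the pattern from the StackOverflow answer linked above: let dateutil try to parse the string and treat a ValueError as "not a date", so date-like tokens can be skipped during descendant collection. A minimal sketch of that check under the assumption that the omitted body uses the fuzzy-parsing variant (looks_like_date is a hypothetical stand-in, not the project's function):

from dateutil.parser import parse

def looks_like_date(string: str, fuzzy: bool = False) -> bool:
    # True if dateutil can interpret the string as a date
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False

print(looks_like_date('12.03.2024'))   # True
print(looks_like_date('Dichtung'))     # False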
def obtain_relevant_descendants( def obtain_relevant_descendants(
token: SpacyToken, token: SpacyToken,
) -> Iterator[SpacyToken]: ) -> Iterator[SpacyToken]:
for descendant in token.subtree: for descendant in token.subtree:
# subtrees contain the token itself # subtrees contain the token itself
# if current element is token skip this element # if current element is token skip this element
@ -81,12 +81,17 @@ def obtain_relevant_descendants(
if is_str_date(string=descendant.text): if is_str_date(string=descendant.text):
continue continue
logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant " logger.debug(
f">>{descendant}<<, POS >>{descendant.pos_}<<")) (
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
)
# eliminate cases of cross-references with verbs # eliminate cases of cross-references with verbs
if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
(descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')): descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
):
continue continue
# skip cases in which descendant is indirect POS with others than verbs # skip cases in which descendant is indirect POS with others than verbs
elif descendant.pos_ in POS_INDIRECT: elif descendant.pos_ in POS_INDIRECT:
@ -99,6 +104,7 @@ def obtain_relevant_descendants(
# TODO look at results and fine-tune function accordingly # TODO look at results and fine-tune function accordingly
def add_doc_info_to_graph( def add_doc_info_to_graph(
graph: TokenGraph, graph: TokenGraph,
doc: SpacyDoc, doc: SpacyDoc,
@ -124,7 +130,7 @@ def add_doc_info_to_graph(
graph=graph, graph=graph,
parent=token.lemma_, parent=token.lemma_,
child=descendant.lemma_, child=descendant.lemma_,
weight_connection=weight weight_connection=weight,
) )
else: else:
# if indirect POS, make connection between all associated words # if indirect POS, make connection between all associated words
@ -139,6 +145,7 @@ def add_doc_info_to_graph(
weight_connection=weight, weight_connection=weight,
) )
def build_token_graph( def build_token_graph(
data: DataFrame, data: DataFrame,
model: GermanSpacyModel, model: GermanSpacyModel,
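add_doc_info_to_graph walks each token's dependency subtree and connects lemmas in the TokenGraph, increasing the edge weight whenever a pair co-occurs again. A rough sketch of that idea, using networkx in place of the project's TokenGraph/update_graph helpers so it stays self-contained; the POS and date filtering are reduced to a bare minimum and do not mirror the real rules, and the de_dep_news_trf model is assumed to be installed:

import networkx as nx
import spacy

nlp = spacy.load('de_dep_news_trf')   # same German model the pipelines load
graph = nx.Graph()

for doc in nlp.pipe(['Dichtung der Pumpe getauscht', 'Pumpe erneut undicht']):
    for token in doc:
        if token.pos_ not in {'NOUN', 'VERB', 'AUX'}:
            continue
        for descendant in token.subtree:
            # the subtree contains the token itself; skip it and punctuation
            if descendant.i == token.i or descendant.pos_ == 'PUNCT':
                continue
            parent, child = token.lemma_, descendant.lemma_
            if graph.has_edge(parent, child):
                graph[parent][child]['weight'] += 1   # strengthen an existing connection
            else:
                graph.add_edge(parent, child, weight=1)

print(sorted(graph.edges(data='weight')))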

View File

@ -0,0 +1,55 @@
from pathlib import Path
from typing import Final
from lang_main import CONFIG
# ** paths
INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs'])
SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results'])
PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset'])
# ** control
DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** export
# ** preprocessing
FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][
'filename_cossim_filter_candidates'
]
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
'threshold_amount_characters'
]
THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
# ** token analysis
# ** graph postprocessing
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['activity_types']
)
THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
'threshold_num_activities'
]
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
'threshold_similarity'
]

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8
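The [control] table decides which pipeline stages run; each stage also has a *_skip companion flag (their exact semantics, e.g. loading previously saved results instead of recomputing, are an assumption here). A minimal sketch of reading those flags with tomllib, the same loader lang_main.shared uses, assuming the file above is stored as config.toml in the working directory:

import tomllib
from pathlib import Path

with Path('config.toml').open('rb') as f:   # hypothetical location of the file above
    config = tomllib.load(f)

for stage in ('preprocessing', 'token_analysis', 'graph_postprocessing', 'time_analysis'):
    run_stage = config['control'][stage]
    skip_stage = config['control'][f'{stage}_skip']
    print(f'{stage}: run={run_stage}, skip={skip_stage}')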

View File

@ -1,5 +1,5 @@
from typing import Final
import logging import logging
from typing import Final
from lang_main.types import LoggingLevels from lang_main.types import LoggingLevels

View File

@ -1,20 +1,18 @@
from typing import Any
#from types import FunctionType
import sys
import logging
from collections.abc import Callable from collections.abc import Callable
from pathlib import Path from pathlib import Path
from typing import Any
from lang_main.loggers import logger_pipelines as logger from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import save_pickle, load_pickle from lang_main.shared import load_pickle, save_pickle
# ** pipelines to perform given actions on dataset in a customisable manner # ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception): class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline""" """Error describing that no action is available in the current pipeline"""
class BasePipeline():
class BasePipeline:
def __init__( def __init__(
self, self,
name: str, name: str,
@ -27,6 +25,8 @@ class BasePipeline():
self.name = name self.name = name
# working directory for pipeline == output path # working directory for pipeline == output path
self.working_dir = working_dir self.working_dir = working_dir
# if not self.working_dir.exists():
# self.working_dir.mkdir(parents=True)
# container for actions to perform during pass # container for actions to perform during pass
self.actions: list[Callable] = [] self.actions: list[Callable] = []
@ -39,8 +39,10 @@ class BasePipeline():
self._intermediate_result: Any | None = None self._intermediate_result: Any | None = None
def __repr__(self) -> str: def __repr__(self) -> str:
return (f"{self.__class__.__name__}(name: {self.name}, " return (
f"working dir: {self.working_dir}, contents: {self.action_names})") f'{self.__class__.__name__}(name: {self.name}, '
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
@property @property
def intermediate_result(self) -> Any: def intermediate_result(self) -> Any:
@ -60,8 +62,9 @@ class BasePipeline():
self.actions_kwargs.append(action_kwargs.copy()) self.actions_kwargs.append(action_kwargs.copy())
self.is_save_result.append(save_result) self.is_save_result.append(save_result)
else: else:
raise TypeError(("Action must be custom function, " raise TypeError(
f"but is of type >>{type(action)}<<.")) f'Action must be custom function, but is of type >>{type(action)}<<.'
)
# TODO: add multiple entries by utilising simple add method # TODO: add multiple entries by utilising simple add method
""" """
@ -107,13 +110,14 @@ class BasePipeline():
return data return data
def prep_run(self) -> None: def prep_run(self) -> None:
logger.info(f"Starting processing pipeline >>{self.name}<<...") logger.info(f'Starting processing pipeline >>{self.name}<<...')
# progress tracking # progress tracking
self.curr_proc_idx = 1 self.curr_proc_idx = 1
# check if performable actions available # check if performable actions available
if len(self.actions) == 0: if len(self.actions) == 0:
raise NoPerformableActionError(("The pipeline does not contain any " raise NoPerformableActionError(
"performable actions.")) ('The pipeline does not contain any ' 'performable actions.')
)
def run( def run(
self, self,
@ -135,6 +139,6 @@ class BasePipeline():
# processing tracking # processing tracking
self.curr_proc_idx += 1 self.curr_proc_idx += 1
logger.info(f"Processing pipeline >>{self.name}<< successfully ended.") logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
return ret return ret
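BasePipeline simply chains plain functions: add registers an action with optional keyword arguments and a save_result flag, and run feeds starting_values through the chain, pickling intermediate results into working_dir when requested. A hedged usage sketch with toy actions; the calling convention assumed here (each action receives the previous step's returned tuple unpacked as positional arguments and returns a tuple) is inferred from the predefined pipelines and may not match every detail:

from pathlib import Path

from lang_main.pipelines.base import BasePipeline

def make_numbers(n: int) -> tuple[list[int]]:
    # toy first action: build a list from the starting value
    return (list(range(n)),)

def square_numbers(numbers: list[int]) -> tuple[list[int]]:
    # toy second action: consume the previous action's output
    return ([x * x for x in numbers],)

pipe_demo = BasePipeline(name='Demo', working_dir=Path('./results/demo'))
pipe_demo.add(make_numbers)
pipe_demo.add(square_numbers)
ret = pipe_demo.run(starting_values=(5,))
print(ret)   # expected: ([0, 1, 4, 9, 16],)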

View File

@ -1,57 +1,144 @@
from sentence_transformers import SentenceTransformer
import spacy import spacy
from sentence_transformers import SentenceTransformer
from lang_main import (
SAVE_PATH_FOLDER,
DATE_COLS,
FILENAME_COSSIM_FILTER_CANDIDATES,
THRESHOLD_SIMILARITY,
)
from lang_main.pipelines.base import BasePipeline
from lang_main.analysis.preprocessing import ( from lang_main.analysis.preprocessing import (
load_raw_data, analyse_feature,
remove_duplicates,
remove_NA,
clean_string_slim, clean_string_slim,
entry_wise_cleansing, entry_wise_cleansing,
analyse_feature, load_raw_data,
build_cosSim_matrix,
filt_thresh_cosSim_matrix,
list_cosSim_dupl_candidates,
merge_similarity_dupl, merge_similarity_dupl,
remove_duplicates,
remove_NA,
)
from lang_main.analysis.timeline import (
filter_activities_per_obj_id,
generate_model_input,
get_timeline_candidates,
remove_non_relevant_obj_ids,
) )
from lang_main.analysis.tokens import build_token_graph from lang_main.analysis.tokens import build_token_graph
from lang_main.constants import (
ACTIVITY_FEATURE,
ACTIVITY_TYPES,
DATE_COLS,
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
SAVE_PATH_FOLDER,
THRESHOLD_NUM_ACTIVITIES,
THRESHOLD_SIMILARITY,
THRESHOLD_TIMELINE_SIMILARITY,
THRESHOLD_UNIQUE_TEXTS,
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import BasePipeline
# ** pipeline configuration # ** pipeline configuration
# ** target feature preparation # ** target feature preparation
pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER) pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS}) pipe_target_feat.add(
load_raw_data,
{
'date_cols': DATE_COLS,
},
)
pipe_target_feat.add(remove_duplicates) pipe_target_feat.add(remove_duplicates)
pipe_target_feat.add(remove_NA, save_result=True) pipe_target_feat.add(remove_NA, save_result=True)
pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim}) pipe_target_feat.add(
pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True) entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'cleansing_func': clean_string_slim,
},
)
pipe_target_feat.add(
analyse_feature,
{
'target_feature': 'VorgangsBeschreibung',
},
save_result=True,
)
# output: DataFrame containing target feature with # output: DataFrame containing target feature with
# number of occurrences and associated ObjectIDs # number of occurrences and associated ObjectIDs
# ** embedding pipe # ** embedding pipe
# ?? still needed?
# using similarity between entries to catch duplicates with typo or similar content # using similarity between entries to catch duplicates with typo or similar content
pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) # pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER)
model_spacy = spacy.load('de_dep_news_trf') model_spacy = spacy.load('de_dep_news_trf')
model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) # pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True)
pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True) # pipe_embds.add(
pipe_embds.add( # filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True
list_cosSim_dupl_candidates, # )
{'save_candidates': True, # pipe_embds.add(
'saving_path': SAVE_PATH_FOLDER, # list_cosSim_dupl_candidates,
'filename': FILENAME_COSSIM_FILTER_CANDIDATES, # {
'pipeline': pipe_embds}, save_result=True) # 'save_candidates': True,
# 'saving_path': SAVE_PATH_FOLDER,
# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES,
# 'pipeline': pipe_embds,
# },
# save_result=True,
# )
# ** Merge duplicates # ** Merge duplicates
pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER) pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
pipe_merge.add(merge_similarity_dupl, save_result=True) # pipe_merge.add(merge_similarity_dupl, save_result=True)
pipe_merge.add(
merge_similarity_dupl,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_SIMILARITY,
},
save_result=True,
)
# ** token analysis # ** token analysis
pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER) pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True) pipe_token_analysis.add(
build_token_graph,
{
'model': model_spacy,
},
save_result=True,
)
# ** timeline analysis
pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
},
save_result=True,
)
pipe_timeline.add(
generate_model_input,
{
'target_feature_name': 'nlp_model_input',
'model_input_features': MODEL_INPUT_FEATURES,
},
)
pipe_timeline.add(
filter_activities_per_obj_id,
{
'activity_feature': ACTIVITY_FEATURE,
'relevant_activity_types': ACTIVITY_TYPES,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
},
)
pipe_timeline.add(
get_timeline_candidates,
{
'model': model_stfr,
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
'feature_obj_id': FEATURE_NAME_OBJ_ID,
'model_input_feature': 'nlp_model_input',
},
save_result=True,
)

View File

@ -1,38 +1,47 @@
from typing import Any
import os import os
import shutil
import pickle import pickle
import shutil
import tomllib import tomllib
from pathlib import Path from pathlib import Path
from typing import Any
from lang_main.loggers import logger_shared_helpers as logger from lang_main.loggers import logger_shared_helpers as logger
# ** Lib # ** Lib
def create_saving_folder( def create_saving_folder(
saving_path_folder: str | Path, saving_path_folder: str | Path,
overwrite_existing: bool = False, overwrite_existing: bool = False,
) -> None: ) -> None:
# check for existence of given path # check for existence of given path
if not os.path.exists(saving_path_folder): if isinstance(saving_path_folder, str):
os.makedirs(saving_path_folder) saving_path_folder = Path(saving_path_folder)
if not saving_path_folder.exists():
saving_path_folder.mkdir(parents=True)
else: else:
if overwrite_existing: if overwrite_existing:
# overwrite if desired (deletes whole path and re-creates it) # overwrite if desired (deletes whole path and re-creates it)
shutil.rmtree(saving_path_folder) shutil.rmtree(saving_path_folder)
os.makedirs(saving_path_folder) os.makedirs(saving_path_folder)
else: else:
logger.info((f"Path >>{saving_path_folder}<< already exists and remained " logger.info(
"unchanged. If you want to overwrite this path, use parameter " (
">>overwrite_existing<<.")) f'Path >>{saving_path_folder}<< already exists and remained '
f'unchanged. If you want to overwrite this path, use parameter '
f'>>overwrite_existing<<.'
)
)
def load_toml_config( def load_toml_config(
path_to_toml: str | Path, path_to_toml: str | Path,
) -> dict[str, Any]: ) -> dict[str, Any]:
with open(path_to_toml, "rb") as f: with open(path_to_toml, 'rb') as f:
data = tomllib.load(f) data = tomllib.load(f)
logger.info("Loaded TOML config file successfully.") logger.info('Loaded TOML config file successfully.')
return data return data
# saving and loading using pickle # saving and loading using pickle
# careful: pickling from unknown sources can be dangerous # careful: pickling from unknown sources can be dangerous
def save_pickle( def save_pickle(
@ -41,16 +50,18 @@ def save_pickle(
) -> None: ) -> None:
with open(path, 'wb') as file: with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f"Saved file successfully under {path}") logger.info(f'Saved file successfully under {path}')
def load_pickle( def load_pickle(
path: str | Path, path: str | Path,
) -> Any: ) -> Any:
with open(path, 'rb') as file: with open(path, 'rb') as file:
obj = pickle.load(file) obj = pickle.load(file)
logger.info("Loaded file successfully.") logger.info('Loaded file successfully.')
return obj return obj
# TODO: remove, too specialised for common application # TODO: remove, too specialised for common application
""" """
def filter_candidates_idx( def filter_candidates_idx(

View File

@ -1,4 +1,4 @@
from typing import TypeAlias, Literal from typing import Literal, TypeAlias
import numpy as np import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc from spacy.tokens.doc import Doc as SpacyDoc

View File

@ -13,29 +13,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 2,
"id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "ModuleNotFoundError",
"output_type": "stream", "evalue": "No module named 'ihm_analyse'",
"text": [ "output_type": "error",
"INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" "traceback": [
] "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
}, "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
{ "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n",
"name": "stderr", "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'"
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
] ]
} }
], ],
"source": [ "source": [
"from ihm_analyse import CONFIG\n", "from lang_main import CONFIG\n",
"from ihm_analyse.lib.preprocess import (\n", "from lang_main.lib.preprocess import (\n",
" load_raw_data,\n", " load_raw_data,\n",
" remove_duplicates,\n", " remove_duplicates,\n",
" remove_NA,\n", " remove_NA,\n",
@ -47,8 +43,8 @@
" list_cosSim_dupl_candidates,\n", " list_cosSim_dupl_candidates,\n",
" merge_similarity_dupl,\n", " merge_similarity_dupl,\n",
")\n", ")\n",
"from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", "from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n",
"from ihm_analyse.lib.helpers import (\n", "from lang_main.lib.helpers import (\n",
" save_pickle, \n", " save_pickle, \n",
" load_pickle, \n", " load_pickle, \n",
" create_saving_folder,\n", " create_saving_folder,\n",

View File

@ -1,28 +1,42 @@
from typing import cast from typing import cast
from pathlib import Path
import pandas as pd
import plotly.express as px
from dash import ( from dash import (
Dash, Dash,
html,
dcc,
callback,
Output,
Input, Input,
Output,
State, State,
callback,
dash_table, dash_table,
dcc,
html,
) )
import plotly.express as px
import pandas as pd
from pandas import DataFrame
from lang_main import load_pickle from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data # ** data
data = cast(DataFrame, load_pickle('./data.pkl')) p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl')) p_tl = Path(
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl')) r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
ret = cast(tuple[DataFrame], load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [ table_feats = [
'ErstellungsDatum', 'ErstellungsDatum',
'ErledigungsDatum', 'ErledigungsDatum',
@ -53,23 +67,26 @@ app = Dash(prevent_initial_callbacks=True)
app.layout = [ app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}), html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(children=[ html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'), html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown( dcc.Dropdown(
list(cands.keys()), list(cands.keys()),
id='dropdown-selection', id='dropdown-selection',
placeholder="ObjektID auswählen...", placeholder='ObjektID auswählen...',
) ),
]), ]
html.Div(children=[ ),
html.Div(
children=[
html.H3(id='object_text'), html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'), dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'), dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
] ]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback( @callback(
Output('object_text', 'children'), Output('object_text', 'children'),
@ -82,6 +99,7 @@ def update_obj_text(obj_id):
headline = f'HObjektText: {obj_text}' headline = f'HObjektText: {obj_text}'
return headline return headline
@callback( @callback(
Output('choice-candidates', 'options'), Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'), Input('dropdown-selection', 'value'),
@ -93,6 +111,7 @@ def update_choice_candidates(obj_id):
choices = list(range(1, len(cands_obj_id) + 1)) choices = list(range(1, len(cands_obj_id) + 1))
return choices return choices
@callback( @callback(
Output('graph-output', 'figure'), Output('graph-output', 'figure'),
Input('choice-candidates', 'value'), Input('choice-candidates', 'value'),
@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
title=title, title=title,
hover_data=hover_data, hover_data=hover_data,
) )
fig.update_traces( fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_xaxes( fig.update_xaxes(
tickformat="%B\n%Y", tickformat='%B\n%Y',
rangeslider_visible=True, rangeslider_visible=True,
) )
fig.update_yaxes(type='category') fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified") fig.update_layout(hovermode='x unified')
return fig return fig
@callback( @callback(
[Output('table-candidates', 'data'), [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'), Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'), State('dropdown-selection', 'value'),
prevent_initial_call=True, prevent_initial_call=True,
@ -144,10 +159,10 @@ def update_table_candidates(index, obj_id):
cands_choice = cands_obj_id[int(index) - 1] cands_choice = cands_obj_id[int(index) - 1]
# data # data
df = data.loc[list(cands_choice)].sort_index() df = data.loc[list(cands_choice)].sort_index()
df = (df df = df.filter(items=table_feats, axis=1).sort_values(
.filter(items=table_feats, axis=1) by='ErstellungsDatum', ascending=True
.sort_values(by='ErstellungsDatum', ascending=True)) )
cols = [{"name": i, "id": i} for i in df.columns] cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings # convert dates to strings
for col in table_feats_dates: for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d') df[col] = df[col].dt.strftime(r'%Y-%m-%d')
@ -155,5 +170,6 @@ def update_table_candidates(index, obj_id):
table_data = df.to_dict('records') table_data = df.to_dict('records')
return table_data, cols return table_data, cols
if __name__ == '__main__': if __name__ == '__main__':
app.run(debug=True) app.run(debug=True)

View File

@ -0,0 +1,56 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

View File

@ -0,0 +1,663 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "3760b040-985c-46ec-ba77-13f0f7a52c83",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from lang_main import load_pickle"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86",
"metadata": {},
"outputs": [],
"source": [
"def get_files(path: str) -> tuple[Path, ...]:\n",
" p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
" assert p.exists(), \"path does not exist\"\n",
" return tuple(p.glob(r'*'))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "55ad4af3-87cd-4189-9309-171aba4e04a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "540f4720-a2bf-4171-8db5-8e6993d38c13",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2559</th>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2558</th>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2557</th>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2556</th>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6782</th>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" entry ... num_assoc_obj_ids\n",
"162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n",
"33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n",
"131 Tägliche Überprüfung der Ölabscheider ... 4\n",
"160 Wöchentliche Kontrolle der C-Anlagen ... 11\n",
"140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n",
"... ... ... ...\n",
"2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n",
"2558 T-Warp-Let-Off1 schleppfehler ... 1\n",
"2557 Fahrräder wurden gewartet und gereinigt. ... 1\n",
"2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n",
"6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n",
"\n",
"[4545 rows x 5 columns]"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "82a059ea-0eb8-4db1-b859-3fc07e42faff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d1c1190f-0c80-40e3-8965-78d68400a33d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n"
]
}
],
"source": [
"file = files[-1]\n",
"ret = load_pickle(file)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "beacf5ca-6946-413a-817c-e7e87da9ace3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>entry</th>\n",
" <th>len</th>\n",
" <th>num_occur</th>\n",
" <th>assoc_obj_ids</th>\n",
" <th>num_assoc_obj_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>162</td>\n",
" <td>Tägliche Wartungstätigkeiten nach Vorgabe des ...</td>\n",
" <td>66</td>\n",
" <td>92592</td>\n",
" <td>[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...</td>\n",
" <td>206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>33</td>\n",
" <td>Wöchentliche Sichtkontrolle / Reinigung</td>\n",
" <td>39</td>\n",
" <td>3108</td>\n",
" <td>[301, 304, 305, 313, 314, 323, 329, 331, 332, ...</td>\n",
" <td>74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>131</td>\n",
" <td>Tägliche Überprüfung der Ölabscheider</td>\n",
" <td>37</td>\n",
" <td>1619</td>\n",
" <td>[0, 970, 2134, 2137]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160</td>\n",
" <td>Wöchentliche Kontrolle der C-Anlagen</td>\n",
" <td>36</td>\n",
" <td>1265</td>\n",
" <td>[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>140</td>\n",
" <td>Halbjährliche Kontrolle des Stabbreithalters</td>\n",
" <td>44</td>\n",
" <td>687</td>\n",
" <td>[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...</td>\n",
" <td>166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>2559</td>\n",
" <td>Fehler 9723 Leistungsversorgung Antrieb defekt</td>\n",
" <td>46</td>\n",
" <td>1</td>\n",
" <td>[211]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6757</th>\n",
" <td>2558</td>\n",
" <td>T-Warp-Let-Off1 schleppfehler</td>\n",
" <td>30</td>\n",
" <td>1</td>\n",
" <td>[93]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6758</th>\n",
" <td>2557</td>\n",
" <td>Fahrräder wurden gewartet und gereinigt.</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" <td>[1707]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6759</th>\n",
" <td>2556</td>\n",
" <td>Bohrlöcher an Gebots- und Verbotszeichen anbri...</td>\n",
" <td>173</td>\n",
" <td>1</td>\n",
" <td>[1]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6760</th>\n",
" <td>6782</td>\n",
" <td>Befestigung Deckel für Batteriefach defekt ...</td>\n",
" <td>106</td>\n",
" <td>2</td>\n",
" <td>[306, 326]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4545 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" index ... num_assoc_obj_ids\n",
"0 162 ... 206\n",
"1 33 ... 74\n",
"2 131 ... 4\n",
"3 160 ... 11\n",
"4 140 ... 166\n",
"... ... ... ...\n",
"6756 2559 ... 1\n",
"6757 2558 ... 1\n",
"6758 2557 ... 1\n",
"6759 2556 ... 1\n",
"6760 6782 ... 2\n",
"\n",
"[4545 rows x 6 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 72,
"id": "cbf0b450-ec00-471f-9627-717e52c5471d",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807",
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(i for i in range(100000000))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ade9c6f4e61410fb93f35e43222705b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num = 0\n",
"for i in tqdm(tup):\n",
" num += i"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "64cd6cc7-2803-41f1-b05c-83d65bdc7d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4999999950000000"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36366147-3632-4518-936e-878563305e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4dbc00b8-1437-4986-85e4-645a8bcf4a6d",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "17156aa0-8fd6-407b-b014-698df0e534a9",
"metadata": {},
"outputs": [],
"source": [
"arr = np.random.rand(1000,1000)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "4292a60b-9cb2-42d9-bedf-3b1120f1b515",
"metadata": {},
"outputs": [],
"source": [
"idx = np.argwhere(arr >= 0.97)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30220"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(idx)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "5b78436e-a828-42bd-a5ed-ae6045349391",
"metadata": {},
"outputs": [],
"source": [
"batch = idx[:200]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "75edc50e-b64c-4319-8f74-27653ed3452c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(map(tuple, batch))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"tuple(tuple(x) for x in batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}