diff --git a/pyproject.toml b/pyproject.toml index 7218e7c..fd2ab1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,3 +34,15 @@ trials = [ "plotly>=5.22.0", "dash>=2.17.0", ] + +[tool.ruff] +line-length = 94 +indent-width = 4 +target-version = "py311" + +[tool.ruff.format] +quote-style = "single" +skip-magic-trailing-comma = false + +[tool.ruff.lint] +select = ["E", "F", "I"] \ No newline at end of file diff --git a/scripts/analyse_dataset.py b/scripts/analyse_dataset.py index bd382a5..2316ede 100644 --- a/scripts/analyse_dataset.py +++ b/scripts/analyse_dataset.py @@ -1,33 +1,43 @@ import typing +import warnings +from pathlib import Path +from typing import cast -from pandas import DataFrame, Series - -from ihm_analyse import ( - SAVE_PATH_FOLDER, - PATH_TO_DATASET, - THRESHOLD_AMOUNT_CHARACTERS, - THRESHOLD_EDGE_WEIGHT, - DO_PREPROCESSING, - DO_TOKEN_ANALYSIS, - DO_GRAPH_POSTPROCESSING, +from lang_main import ( + TokenGraph, create_saving_folder, load_pickle, - Embedding, - Index, - TokenGraph, ) -from ihm_analyse.predefined_pipes import ( - pipe_target_feat, - pipe_embds, +from lang_main.constants import ( + DO_GRAPH_POSTPROCESSING, + DO_PREPROCESSING, + DO_TIME_ANALYSIS, + DO_TOKEN_ANALYSIS, + INPUT_PATH_FOLDER, + PATH_TO_DATASET, + SAVE_PATH_FOLDER, + SKIP_GRAPH_POSTPROCESSING, + SKIP_PREPROCESSING, + SKIP_TIME_ANALYSIS, + SKIP_TOKEN_ANALYSIS, + THRESHOLD_AMOUNT_CHARACTERS, + THRESHOLD_EDGE_WEIGHT, +) + +# Embedding, +# PandasIndex, +from lang_main.pipelines.predefined import ( pipe_merge, + pipe_target_feat, + pipe_timeline, pipe_token_analysis, ) -""" -# ** config parameters -SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) -PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) -THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess']['threshold_amount_characters'] -""" +from lang_main.types import ( + ObjectID, + TimelineCandidates, +) +from pandas import DataFrame, Series + # ** processing pipeline def run_preprocessing() -> DataFrame: @@ -36,80 +46,147 @@ def run_preprocessing() -> DataFrame: overwrite_existing=True, ) # run pipelines - ret = typing.cast(tuple[DataFrame], - pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))) + ret = typing.cast( + tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,)) + ) target_feat_data = ret[0] # only entries with more than threshold amount of characters - data_filter = typing.cast(Series, - (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)) - subset_data = target_feat_data.loc[data_filter, 'entry'].copy() - dupl_idx_pairs, embds = typing.cast( - tuple[list[tuple[Index, Index]], dict[int, tuple[Embedding, str]]], - pipe_embds.run(starting_values=(subset_data,)) - ) + data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)) + # subset_data = target_feat_data.loc[data_filter, 'entry'].copy() + # dupl_idx_pairs, embds = typing.cast( + # tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]], + # pipe_embds.run(starting_values=(subset_data,)), + # ) # merge duplicates, results saved separately - ret = typing.cast(tuple[DataFrame], - pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs))) + subset_data = target_feat_data.loc[data_filter].copy() + ret = typing.cast( + tuple[DataFrame], + # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)), + pipe_merge.run(starting_values=(subset_data,)), + ) preprocessed_data = ret[0] - + return preprocessed_data + def run_token_analysis( preprocessed_data: 
DataFrame, ) -> TokenGraph: # build token graph - (tk_graph,) = typing.cast(tuple[TokenGraph], - pipe_token_analysis.run(starting_values=(preprocessed_data,))) + (tk_graph,) = typing.cast( + tuple[TokenGraph], pipe_token_analysis.run(starting_values=(preprocessed_data,)) + ) tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False) - tk_graph.to_pickle(SAVE_PATH_FOLDER, - filename=f'{pipe_token_analysis.name}-TokenGraph') - + tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph') + return tk_graph + def run_graph_postprocessing( tk_graph: TokenGraph, ) -> TokenGraph: # filter graph by edge weight and remove single nodes (no connection) tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT) tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1) - tk_graph_filtered.save_graph(SAVE_PATH_FOLDER, - filename='TokenGraph-filtered', - directed=False) - tk_graph_filtered.to_pickle(SAVE_PATH_FOLDER, - filename=f'{pipe_token_analysis.name}-TokenGraph-filtered') - + tk_graph_filtered.save_graph( + SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False + ) + tk_graph_filtered.to_pickle( + SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered' + ) + return tk_graph_filtered -if __name__ == '__main__': + +def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]: + filename = 'without_nan' + loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + ret = load_pickle(loading_path) + preprocessed_data = ret[0] + + ret = cast( + tuple[TimelineCandidates, dict[ObjectID, str]], + pipe_timeline.run(starting_values=(preprocessed_data,)), + ) + return ret + + +def verify_path( + loading_path: Path, +) -> None: + if not loading_path.exists(): + raise FileNotFoundError(f'Could not load results. File not found: {loading_path}') + + +def main() -> None: + pre_step_skipped: bool = False # ** preprocess - if DO_PREPROCESSING: + if DO_PREPROCESSING and not SKIP_PREPROCESSING: preprocessed_data = run_preprocessing() - else: + elif not SKIP_PREPROCESSING: # !! hardcoded result filenames target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*' - target_filepath = list(SAVE_PATH_FOLDER.glob(target_pattern))[0] - ret = typing.cast(tuple[DataFrame], - load_pickle(target_filepath)) + loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0] + verify_path(loading_path) + ret = typing.cast(tuple[DataFrame], load_pickle(loading_path)) preprocessed_data = ret[0] - # ** token analysis - if DO_TOKEN_ANALYSIS: - preprocessed_data_trunc = typing.cast(DataFrame, - preprocessed_data[['entry', 'num_occur']].copy()) # type: ignore - tk_graph = run_token_analysis(preprocessed_data_trunc) else: + pre_step_skipped = True + warnings.warn('No preprocessing action selected. Skipped.') + # sys.exit(0) + # ** token analysis + if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS: + if pre_step_skipped: + raise RuntimeError( + 'Preprocessing step skipped. Token analysis cannot be performed.' + ) + preprocessed_data_trunc = typing.cast( + DataFrame, preprocessed_data[['entry', 'num_occur']].copy() + ) # type: ignore + tk_graph = run_token_analysis(preprocessed_data_trunc) + elif not SKIP_TOKEN_ANALYSIS: # !! 
hardcoded result filenames # whole graph filename: str = f'{pipe_token_analysis.name}-TokenGraph' - loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle') - #tk_graph = typing.cast(TokenGraph, load_pickle(loading_path)) + loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path)) tk_graph = TokenGraph.from_pickle(loading_path) - # ** graph postprocessing - if DO_GRAPH_POSTPROCESSING: - tk_graph_filtered = run_graph_postprocessing(tk_graph) + pre_step_skipped = False else: + pre_step_skipped = True + warnings.warn('No token analysis action selected. Skipped.') + # ** graph postprocessing + if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING: + if pre_step_skipped: + raise RuntimeError( + ( + 'Preprocessing or token analysis step skipped. ' + 'Graph postprocessing cannot be performed.' + ) + ) + tk_graph_filtered = run_graph_postprocessing(tk_graph) + elif not SKIP_GRAPH_POSTPROCESSING: # !! hardcoded result filenames # filtered graph filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered' - loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pickle') - #tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path)) - tk_graph_filtered = TokenGraph.from_pickle(loading_path) \ No newline at end of file + loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl') + verify_path(loading_path) + # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path)) + tk_graph_filtered = TokenGraph.from_pickle(loading_path) + pre_step_skipped = False + else: + warnings.warn('No graph postprocessing action selected. Skipped.') + # ** time analysis + if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS: + # no check for fails, runs separately + ret = run_time_analysis() + elif not SKIP_TIME_ANALYSIS: + ... + else: + warnings.warn('No time analysis action selected. 
Skipped.') + + +if __name__ == '__main__': + main() diff --git a/scripts/inputs/without_nan.pkl b/scripts/inputs/without_nan.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/scripts/inputs/without_nan.pkl differ diff --git a/src/lang_main/config.toml b/scripts/lang_main_config copy.toml similarity index 65% rename from src/lang_main/config.toml rename to scripts/lang_main_config copy.toml index e5f978f..8cf2829 100644 --- a/src/lang_main/config.toml +++ b/scripts/lang_main_config copy.toml @@ -1,17 +1,21 @@ # lang_main: Config file [paths] -results = './results/test_new2/' -dataset = './01_2_Rohdaten_neu/Export4.csv' +inputs = 'A:/Arbeitsaufgaben/lang-main/scripts' +results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/' +dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv' #results = './results/Export7/' #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' #results = './results/Export7_trunc/' #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' [control] -preprocessing = false -token_analysis = true +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = true graph_postprocessing = false +graph_postprocessing_skip = true #[export_filenames] #filename_cossim_filter_candidates = 'CosSim-FilterCandidates' diff --git a/scripts/lang_main_config.toml b/scripts/lang_main_config.toml new file mode 100644 index 0000000..3d0fdd7 --- /dev/null +++ b/scripts/lang_main_config.toml @@ -0,0 +1,59 @@ +# lang_main: Config file + +[paths] +inputs = 'A:/Arbeitsaufgaben/lang-main/scripts/inputs/' +results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/' +dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = true +token_analysis = false +token_analysis_skip = true +graph_postprocessing = false +graph_postprocessing_skip = true +time_analysis = true +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + +[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +# input_features = [ +# 'VorgangsTypName', +# 'VorgangsArtText', +# 'VorgangsBeschreibung', +# ] +input_features = [ + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py new file mode 100644 index 0000000..8076042 --- /dev/null +++ b/scripts/test.py @@ -0,0 +1,12 @@ +from lang_main.analysis.preprocessing import clean_string_slim +from lang_main.constants import SAVE_PATH_FOLDER + +print(SAVE_PATH_FOLDER) +txt = """ +Wir feiern den Jahrestag, olé! +tel:::: !!!!???? +++49 123 456 789 + +Doch leben wir länger. 
+""" +print(txt) +print(clean_string_slim(txt)) diff --git a/src/lang_main/__init__.py b/src/lang_main/__init__.py index c6ae768..85a218e 100644 --- a/src/lang_main/__init__.py +++ b/src/lang_main/__init__.py @@ -1,18 +1,19 @@ -from typing import Final, Any import inspect -import sys import logging -from time import gmtime +import shutil +import sys from pathlib import Path +from time import gmtime +from typing import Any, Final -from lang_main.shared import ( - save_pickle, - load_pickle, - create_saving_folder, - load_toml_config, -) -from lang_main.analysis.preprocessing import Embedding, PandasIndex from lang_main.analysis.graphs import TokenGraph +from lang_main.analysis.preprocessing import Embedding, PandasIndex +from lang_main.shared import ( + create_saving_folder, + load_pickle, + load_toml_config, + save_pickle, +) __all__ = [ 'save_pickle', @@ -32,37 +33,30 @@ logging.basicConfig( datefmt=LOG_DATE_FMT, ) -USE_INTERNAL_CONFIG: Final[bool] = True +CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' +USE_INTERNAL_CONFIG: Final[bool] = False + +pkg_dir = Path(__file__).parent +cfg_path_internal = pkg_dir / CONFIG_FILENAME # load config data: internal/external if USE_INTERNAL_CONFIG: - curr_file_dir = Path(inspect.getfile(inspect.currentframe())) # type: ignore - pkg_dir = curr_file_dir.parent - config_path = Path(pkg_dir, 'config.toml') - loaded_config = load_toml_config(path_to_toml=config_path) - CONFIG: Final[dict[str, Any]] = loaded_config.copy() + loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal) else: - raise NotImplementedError("External config data not implemented yet.") + caller_file = Path(inspect.stack()[-1].filename) + if not caller_file.exists(): + raise FileNotFoundError('Caller file could not be correctly retrieved.') + cfg_path_external = caller_file.parent / CONFIG_FILENAME + if not cfg_path_external.exists(): + shutil.copy(cfg_path_internal, cfg_path_external) + sys.exit( + ( + 'No config file was found. A new one with default values was created ' + 'in the execution path. Please fill in the necessary values and ' + 'restart the programm.' 
+ ) + ) + # raise NotImplementedError("External config data not implemented yet.") + loaded_cfg = load_toml_config(path_to_toml=cfg_path_external) -# ** paths -SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) -PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) -# ** control -DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] -DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] -DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] -# ** export - -# ** preprocessing -FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] =\ - CONFIG['preprocess']['filename_cossim_filter_candidates'] -DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols'] -THRESHOLD_AMOUNT_CHARACTERS: Final[float] =\ - CONFIG['preprocess']['threshold_amount_characters'] -THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'] -# ** token analysis - -# ** graph postprocessing -THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight'] -# ** time analysis -THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['threshold_unique_texts'] +CONFIG: Final[dict[str, Any]] = loaded_cfg.copy() diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py index 0c524a2..dd74ebc 100644 --- a/src/lang_main/analysis/graphs.py +++ b/src/lang_main/analysis/graphs.py @@ -1,18 +1,18 @@ -import typing -from typing import Any, Self, Literal, overload, Final -import sys -from collections.abc import Hashable -from pathlib import Path import copy +import sys +import typing +from collections.abc import Hashable, Iterable +from pathlib import Path +from typing import Any, Final, Literal, Self, overload +import networkx as nx import numpy as np import numpy.typing as npt -from networkx import Graph, DiGraph -import networkx as nx +from networkx import DiGraph, Graph from pandas import DataFrame from lang_main.loggers import logger_graphs as logger -from lang_main.shared import save_pickle, load_pickle +from lang_main.shared import load_pickle, save_pickle # TODO change logging behaviour, add logging to file LOGGING_DEFAULT: Final[bool] = False @@ -31,18 +31,17 @@ def get_graph_metadata( min_edge_weight: int = 1_000_000 max_edge_weight: int = 0 for edge in graph.edges: - weight = typing.cast(int, - graph[edge[0]][edge[1]]['weight']) + weight = typing.cast(int, graph[edge[0]][edge[1]]['weight']) if weight < min_edge_weight: min_edge_weight = weight if weight > max_edge_weight: max_edge_weight = weight - + # memory edge_mem = sum([sys.getsizeof(e) for e in graph.edges]) node_mem = sum([sys.getsizeof(n) for n in graph.nodes]) total_mem = edge_mem + node_mem - + graph_info.update( num_nodes=num_nodes, num_edges=num_edges, @@ -52,20 +51,22 @@ def get_graph_metadata( edge_memory=edge_mem, total_memory=total_mem, ) - + if logging: - logger.info((f"Graph properties: {num_nodes} Nodes, " - f"{num_edges} Edges")) - logger.info(f"Node memory: {node_mem / 1024:.2f} KB") - logger.info(f"Edge memory: {edge_mem / 1024:.2f} KB") - logger.info(f"Total memory: {total_mem / 1024:.2f} KB") - + logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges')) + logger.info(f'Node memory: {node_mem / 1024:.2f} KB') + logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB') + logger.info(f'Total memory: {total_mem / 1024:.2f} KB') + return graph_info + def update_graph( graph: Graph | DiGraph, - parent: Hashable, - child: Hashable, + *, + batch: Iterable[tuple[Hashable, Hashable]] | None = None, 
+ parent: Hashable | None = None, + child: Hashable | None = None, weight_connection: int = 1, ) -> None: # !! not necessary to check for existence of nodes @@ -78,7 +79,9 @@ def update_graph( graph.add_node(child) """ # check if edge not in Graph - if not graph.has_edge(parent, child): + if batch is not None: + graph.add_edges_from(batch, weight=weight_connection) + elif not graph.has_edge(parent, child): # create new edge, nodes will be created if not already present graph.add_edge(parent, child, weight=weight_connection) else: @@ -87,40 +90,38 @@ def update_graph( weight += weight_connection graph[parent][child]['weight'] = weight + # build undirected adjacency matrix def convert_graph_to_undirected( graph: DiGraph, logging: bool = LOGGING_DEFAULT, ) -> Graph: # get adjacency matrix - adj_mat = typing.cast(DataFrame, - nx.to_pandas_adjacency(G=graph, dtype=np.uint32)) - arr = typing.cast(npt.NDArray[np.uint32], - adj_mat.to_numpy()) + adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32)) + arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy()) # build undirected array: adding edges of lower triangular matrix to upper one arr_upper = np.triu(arr) arr_lower = np.tril(arr) arr_lower = np.rot90(np.fliplr(arr_lower)) arr_new = arr_upper + arr_lower # assign new data and create graph - adj_mat.loc[:] = arr_new # type: ignore - graph_undir = typing.cast(Graph, - nx.from_pandas_adjacency(df=adj_mat)) - + adj_mat.loc[:] = arr_new # type: ignore + graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat)) + # info about graph if logging: - logger.info("Successfully converted graph to one with undirected edges.") + logger.info('Successfully converted graph to one with undirected edges.') _ = get_graph_metadata(graph=graph_undir, logging=logging) - + return graph_undir + class TokenGraph(DiGraph): - def __init__( self, name: str = 'TokenGraph', enable_logging: bool = True, - incoming_graph_data: Any| None = None, + incoming_graph_data: Any | None = None, **attr, ) -> None: super().__init__(incoming_graph_data, **attr) @@ -133,15 +134,17 @@ class TokenGraph(DiGraph): self._metadata_directed: dict[str, int] = {} self._undirected: Graph | None = None self._metadata_undirected: dict[str, int] = {} - + def __repr__(self) -> str: return self.__str__() - + def __str__(self) -> str: - return (f"TokenGraph(name: {self.name}, number of nodes: " - f"{len(self.nodes)}, number of edges: " - f"{len(self.edges)})") - + return ( + f'TokenGraph(name: {self.name}, number of nodes: ' + f'{len(self.nodes)}, number of edges: ' + f'{len(self.edges)})' + ) + # !! only used to verify that saving was done correctly """ def __key(self) -> tuple[Hashable, ...]: @@ -150,7 +153,7 @@ class TokenGraph(DiGraph): def __hash__(self) -> int: return hash(self.__key()) """ - + def copy(self) -> Self: """returns a (deep) copy of the graph @@ -160,51 +163,46 @@ class TokenGraph(DiGraph): deep copy of the graph """ return copy.deepcopy(self) - + @property def name(self) -> str: return self._name - + @property def directed(self) -> Self: return self._directed - + @property def undirected(self) -> Graph | None: return self._undirected - + @property def metadata_directed(self) -> dict[str, int]: return self._metadata_directed - + @property def metadata_undirected(self) -> dict[str, int]: return self._metadata_undirected - + @overload def to_undirected( - self, + self, inplace: Literal[True] = ..., logging: bool | None = ..., - ) -> None: - ... - + ) -> None: ... 
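A note on the new batch path added to update_graph above: Graph.add_edges_from assigns weight=weight_connection to every edge in the iterable and, for edges that already exist, overwrites their stored weight rather than adding to it. So the batch branch does not accumulate edge weights the way the single parent/child branch does. That is fine for the similar-index graphs built in analysis/shared.py further down in this diff, where only connectivity matters, but it is not a drop-in replacement for token co-occurrence counting. A minimal, standalone sketch of the difference (illustrative only, not part of the codebase):

import networkx as nx

g = nx.Graph()
g.add_edge('a', 'b', weight=1)
g.add_edges_from([('a', 'b')], weight=1)
print(g['a']['b']['weight'])  # 1 -> existing weight is overwritten, not incremented

g['a']['b']['weight'] += 1    # the per-edge branch accumulates instead
print(g['a']['b']['weight'])  # 2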
+ @overload def to_undirected( - self, + self, inplace: Literal[False], logging: bool | None = ..., - ) -> Graph: - ... - + ) -> Graph: ... + @overload def to_undirected( - self, - inplace: bool = ..., - logging: bool | None = ... - ) -> Graph | None: - ... - + self, inplace: bool = ..., logging: bool | None = ... + ) -> Graph | None: ... + def to_undirected( self, inplace=True, @@ -212,27 +210,27 @@ class TokenGraph(DiGraph): ) -> Graph | None: if logging is None: logging = self.logging - - self._undirected = convert_graph_to_undirected(graph=self, - logging=logging) - self._metadata_undirected = get_graph_metadata(graph=self._undirected, - logging=logging) + + self._undirected = convert_graph_to_undirected(graph=self, logging=logging) + self._metadata_undirected = get_graph_metadata( + graph=self._undirected, logging=logging + ) if not inplace: return self._undirected - + def update_metadata( self, logging: bool | None = None, ) -> None: if logging is None: logging = self.logging - - self._metadata_directed = get_graph_metadata(graph=self, - logging=logging) + + self._metadata_directed = get_graph_metadata(graph=self, logging=logging) if self._undirected is not None: - self._metadata_undirected = get_graph_metadata(graph=self._undirected, - logging=logging) - + self._metadata_undirected = get_graph_metadata( + graph=self._undirected, logging=logging + ) + def filter_by_edge_weight( self, threshold: int, @@ -252,20 +250,19 @@ class TokenGraph(DiGraph): # filter edges by weight original_graph_edges = copy.deepcopy(self.edges) filtered_graph = self.copy() - + for edge in original_graph_edges: - weight = typing.cast(int, - filtered_graph[edge[0]][edge[1]]['weight']) + weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight']) if weight < threshold: filtered_graph.remove_edge(edge[0], edge[1]) - + if filtered_graph._undirected is not None: filtered_graph.to_undirected(inplace=True, logging=False) - + filtered_graph.update_metadata(logging=False) - + return filtered_graph - + def filter_by_node_degree( self, threshold: int, @@ -285,31 +282,31 @@ class TokenGraph(DiGraph): # filter nodes by degree original_graph_nodes = copy.deepcopy(self.nodes) filtered_graph = self.copy() - + for node in original_graph_nodes: - degree = filtered_graph.degree[node] # type: ignore + degree = filtered_graph.degree[node] # type: ignore if degree < threshold: filtered_graph.remove_node(node) - + if filtered_graph._undirected is not None: filtered_graph.to_undirected(inplace=True, logging=False) - + filtered_graph.update_metadata(logging=False) - + return filtered_graph - + def _save_prepare( self, path: Path, filename: str | None = None, ) -> Path: if filename is not None: - saving_path = path.joinpath(f"{filename}") + saving_path = path.joinpath(f'{filename}') else: - saving_path = path.joinpath(f"{self.name}") - + saving_path = path.joinpath(f'{self.name}') + return saving_path - + def save_graph( self, path: Path, @@ -335,19 +332,18 @@ class TokenGraph(DiGraph): undirected graph should be exported but is not available """ saving_path = self._save_prepare(path=path, filename=filename) - + if directed: target_graph = self._directed elif not directed and self._undirected is not None: target_graph = self._undirected else: - raise ValueError("No undirected graph available.") - + raise ValueError('No undirected graph available.') + saving_path = saving_path.with_suffix('.graphml') nx.write_graphml(G=target_graph, path=saving_path) - logger.info(("Successfully saved graph as GraphML file " - f"under 
{saving_path}.")) - + logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.')) + def to_pickle( self, path: Path, @@ -365,7 +361,7 @@ class TokenGraph(DiGraph): saving_path = self._save_prepare(path=path, filename=filename) saving_path = saving_path.with_suffix('.pickle') save_pickle(obj=self, path=saving_path) - + @classmethod def from_file( cls, @@ -378,15 +374,15 @@ class TokenGraph(DiGraph): match path.suffix: case '.graphml': graph = typing.cast(Self, nx.read_graphml(path, node_type=int)) - logger.info(f"Successfully loaded graph from GraphML file {path}.") + logger.info(f'Successfully loaded graph from GraphML file {path}.') case '.pkl' | '.pickle': graph = typing.cast(Self, load_pickle(path)) - logger.info(f"Successfully loaded graph from pickle file {path}.") + logger.info(f'Successfully loaded graph from pickle file {path}.') case _: - raise ValueError("File format not supported.") - + raise ValueError('File format not supported.') + return graph - + @classmethod def from_pickle( cls, @@ -394,10 +390,10 @@ class TokenGraph(DiGraph): ) -> Self: if isinstance(path, str): path = Path(path) - + if path.suffix not in ('.pkl', '.pickle'): - raise ValueError("File format not supported.") - + raise ValueError('File format not supported.') + graph = typing.cast(Self, load_pickle(path)) - - return graph \ No newline at end of file + + return graph diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py index feecfb6..059f6b9 100644 --- a/src/lang_main/analysis/preprocessing.py +++ b/src/lang_main/analysis/preprocessing.py @@ -1,29 +1,29 @@ -from typing import cast, Callable +import re from collections.abc import Iterable from itertools import combinations -import re from math import factorial from pathlib import Path +from typing import Callable, cast import numpy as np -from torch import Tensor -from pandas import DataFrame, Series import pandas as pd -from spacy.lang.de import German as GermanSpacyModel -from spacy.tokens.doc import Doc as SpacyDoc -from sentence_transformers import SentenceTransformer import sentence_transformers import sentence_transformers.util +from pandas import DataFrame, Series +from sentence_transformers import SentenceTransformer +from spacy.lang.de import German as GermanSpacyModel +from spacy.tokens.doc import Doc as SpacyDoc +from torch import Tensor from tqdm import tqdm -from lang_main.types import Embedding, PandasIndex -from lang_main.loggers import logger_preprocess as logger -from lang_main.pipelines.base import BasePipeline from lang_main.analysis.shared import ( + candidates_by_index, similar_index_connection_graph, similar_index_groups, ) -#from lang_main.analysis.graphs import update_graph, get_graph_metadata +from lang_main.loggers import logger_preprocess as logger +from lang_main.pipelines.base import BasePipeline +from lang_main.types import Embedding, PandasIndex # ** (1) dataset preparation: loading and simple preprocessing @@ -45,7 +45,7 @@ def load_raw_data( path : str path to dataset file, usually CSV file date_cols : list[str], optional - columns which contain dates and are parsed as such, + columns which contain dates and are parsed as such, by default ( 'VorgangsDatum', 'ErledigungsDatum', @@ -61,17 +61,22 @@ def load_raw_data( # load dataset date_cols = list(date_cols) data = pd.read_csv( - filepath_or_buffer=path, - sep=';', - encoding='cp1252', - parse_dates=date_cols, + filepath_or_buffer=path, + sep=';', + encoding='cp1252', + parse_dates=date_cols, dayfirst=True, ) - 
logger.info("Loaded dataset successfully.") - logger.info((f"Dataset properties: number of entries: {len(data)}, " - f"number of features {len(data.columns)}")) + logger.info('Loaded dataset successfully.') + logger.info( + ( + f'Dataset properties: number of entries: {len(data)}, ' + f'number of features {len(data.columns)}' + ) + ) return (data,) + def remove_duplicates( data: DataFrame, ) -> tuple[DataFrame]: @@ -89,7 +94,7 @@ def remove_duplicates( """ # obtain info about duplicates over all features duplicates_filt = data.duplicated() - logger.info(f"Number of duplicates over all features: {duplicates_filt.sum()}") + logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}') # drop duplicates wo_duplicates = data.drop_duplicates(ignore_index=True) duplicates_subset: list[str] = [ @@ -97,16 +102,26 @@ def remove_duplicates( 'ObjektID', ] duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset) - logger.info(("Number of duplicates over subset " - f">>{duplicates_subset}<<: {duplicates_subset_filt.sum()}")) - wo_duplicates =\ - wo_duplicates.drop_duplicates(subset=duplicates_subset, ignore_index=True).copy() - logger.info("Removed all duplicates from dataset successfully.") - logger.info((f"New Dataset properties: number of entries: {len(wo_duplicates)}, " - f"number of features {len(wo_duplicates.columns)}")) - + logger.info( + ( + 'Number of duplicates over subset ' + f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}' + ) + ) + wo_duplicates = wo_duplicates.drop_duplicates( + subset=duplicates_subset, ignore_index=True + ).copy() + logger.info('Removed all duplicates from dataset successfully.') + logger.info( + ( + f'New Dataset properties: number of entries: {len(wo_duplicates)}, ' + f'number of features {len(wo_duplicates.columns)}' + ) + ) + return (wo_duplicates,) + def remove_NA( data: DataFrame, target_features: list[str] = [ @@ -127,17 +142,18 @@ def remove_NA( DataFrame dataset with removed NA entries for given subset of features """ - wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore - logger.info(f"Removed NA entries for features >>{target_features}<< from dataset successfully.") - + wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore + logger.info( + f'Removed NA entries for features >>{target_features}<< from dataset successfully.' 
+ ) + return (wo_NA,) + # ** (2) entry-based cleansing # following functions clean and prepare specific entries, not whole dataset -def clean_string_slim( - string: str -) -> str: - """mapping function to clean single string entries in a series (feature-wise) +def clean_string_slim(string: str) -> str: + """mapping function to clean single string entries in a series (feature-wise) of the dataset, used to be applied element-wise for string features Parameters @@ -151,13 +167,16 @@ def clean_string_slim( cleaned entry """ # remove special chars - pattern = r'[\t\n\r\f\v]' + pattern = r'[\t\n\r\f\v]+' string = re.sub(pattern, ' ', string) + pattern = r'([,;.:!?-_\+]){2,}' # remove whitespaces at the beginning and the end + string = re.sub(pattern, r'\1', string) string = string.strip() - + return string + def entry_wise_cleansing( data: DataFrame, target_feature: str, @@ -165,10 +184,16 @@ def entry_wise_cleansing( ) -> tuple[DataFrame]: # apply given cleansing function to target feature data[target_feature] = data[target_feature].map(cleansing_func) - logger.info((f"Successfully applied entry-wise cleansing procedure >>{cleansing_func.__name__}<< " - f"for feature >>{target_feature}<<")) + logger.info( + ( + f'Successfully applied entry-wise cleansing procedure ' + f'>>{cleansing_func.__name__}<< ' + f'for feature >>{target_feature}<<' + ) + ) return (data,) + # ** in-depth analysis of one feature # following functions try to gain insights on a given feature of the IHM dataset such # as number of occurrences or associated Object IDs @@ -178,15 +203,15 @@ def analyse_feature( ) -> tuple[DataFrame]: # feature columns feature_entries = data[target_feature] - logger.info(f"Number of entries for feature >>{target_feature}<<: {len(feature_entries)}") + logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}') # obtain unique entries unique_feature_entries = feature_entries.unique() - + # prepare result DataFrame cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids'] result_df = pd.DataFrame(columns=cols) - - for entry in tqdm(unique_feature_entries, mininterval=1.): + + for entry in tqdm(unique_feature_entries, mininterval=1.0): len_entry = len(entry) filt = data[target_feature] == entry temp = data[filt] @@ -194,19 +219,16 @@ def analyse_feature( assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable') num_assoc_obj_ids = len(assoc_obj_ids) num_dupl = filt.sum() - - conc_df = pd.DataFrame(data=[[ - entry, - len_entry, - num_dupl, - assoc_obj_ids, - num_assoc_obj_ids - ]], columns=cols) - + + conc_df = pd.DataFrame( + data=[[entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]], + columns=cols, + ) + result_df = pd.concat([result_df, conc_df], ignore_index=True) - + result_df = result_df.sort_values(by='num_occur', ascending=False).copy() - + return (result_df,) @@ -223,16 +245,16 @@ def build_embedding_map( embeddings: dict[int, tuple[Embedding, str]] = {} is_spacy = False is_STRF = False - + if isinstance(model, GermanSpacyModel): is_spacy = True elif isinstance(model, SentenceTransformer): is_STRF = True - + if not any((is_spacy, is_STRF)): - raise NotImplementedError("Model type unknown") - - for (idx, text) in tqdm(data.items(), total=len(data), mininterval=1.): + raise NotImplementedError('Model type unknown') + + for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0): # verbose code: Pyright not inferring types correctly idx = cast(int, idx) text = cast(str, text) @@ -246,12 +268,17 @@ def build_embedding_map( 
logger.debug(f'{embd.text=} has no vector') elif is_STRF: model = cast(SentenceTransformer, model) - embd = cast(Tensor, - model.encode(text, show_progress_bar=False)) + embd = cast(Tensor, model.encode(text, show_progress_bar=False)) embeddings[idx] = (embd, text) - + return embeddings, (is_spacy, is_STRF) + +# adapt interface +# use candidates by index function +# merges: build_embedding_map, build_cosSim_matrix, filt_thresh_cosSim_matrix + + # build similarity matrix out of embeddings def build_cosSim_matrix( data: Series, @@ -259,30 +286,31 @@ def build_cosSim_matrix( ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]: # build empty matrix df_index = data.index - cosineSim_idx_matrix = pd.DataFrame(data=0., columns=df_index, - index=df_index, dtype=np.float32) - - logger.info("Start building embedding map...") - + cosineSim_idx_matrix = pd.DataFrame( + data=0.0, columns=df_index, index=df_index, dtype=np.float32 + ) + + logger.info('Start building embedding map...') + # obtain embeddings based on used model embds, (is_spacy, is_STRF) = build_embedding_map( data=data, model=model, ) - - logger.info("Embedding map built successfully.") - + + logger.info('Embedding map built successfully.') + # apply index based mapping for efficient handling of large texts combs = combinations(df_index, 2) - total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index)-2) - - logger.info("Start calculation of similarity scores...") - - for (idx1, idx2) in tqdm(combs, total=total_combs, mininterval=1.): - #print(f"{idx1=}, {idx2=}") + total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2) + + logger.info('Start calculation of similarity scores...') + + for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0): + # print(f"{idx1=}, {idx2=}") embd1 = embds[idx1][0] embd2 = embds[idx2][0] - + # calculate similarity based on model type if is_spacy: embd1 = cast(SpacyDoc, embds[idx1][0]) @@ -293,14 +321,15 @@ def build_cosSim_matrix( embd2 = cast(Tensor, embds[idx2][0]) cosSim = sentence_transformers.util.cos_sim(embd1, embd2) cosSim = cast(float, cosSim.item()) - + cosineSim_idx_matrix.at[idx1, idx2] = cosSim - - logger.info("Similarity scores calculated successfully.") - + + logger.info('Similarity scores calculated successfully.') + return cosineSim_idx_matrix, embds -# obtain index pairs with cosine similarity + +# obtain index pairs with cosine similarity # greater than or equal to given threshold value def filt_thresh_cosSim_matrix( cosineSim_idx_matrix: DataFrame, @@ -322,11 +351,13 @@ def filt_thresh_cosSim_matrix( Series series with multi index (index pairs) and corresponding similarity score """ - cosineSim_filt = cast(Series, - cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()) - + cosineSim_filt = cast( + Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack() + ) + return cosineSim_filt, embds + def list_cosSim_dupl_candidates( cosineSim_filt: Series, embds: dict[int, tuple[Embedding, str]], @@ -335,7 +366,7 @@ def list_cosSim_dupl_candidates( filename: str = 'CosSim-FilterCandidates', pipeline: BasePipeline | None = None, ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: - """providing an overview of candidates with a similarity score greater than + """providing an overview of candidates with a similarity score greater than given threshold; more suitable for debugging purposes Returns @@ -346,22 +377,24 @@ def list_cosSim_dupl_candidates( list containing 
relevant index pairs for entries with similarity score greater than given threshold """ - logger.info("Start gathering of similarity candidates...") + logger.info('Start gathering of similarity candidates...') # compare found duplicates columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score'] df_candidates = pd.DataFrame(columns=columns) - + index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] - for ((idx1, idx2), score) in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore + for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore # get text content from embedding as second tuple entry - content = [[ - idx1, - embds[idx1][1], - idx2, - embds[idx2][1], - score, - ]] + content = [ + [ + idx1, + embds[idx1][1], + idx2, + embds[idx2][1], + score, + ] + ] # add candidates to collection DataFrame df_conc = pd.DataFrame(columns=columns, data=content) if df_candidates.empty: @@ -370,25 +403,28 @@ def list_cosSim_dupl_candidates( df_candidates = pd.concat([df_candidates, df_conc]) # save index pairs index_pairs.append((idx1, idx2)) - - logger.info("Similarity candidates gathered successfully.") - + + logger.info('Similarity candidates gathered successfully.') + if save_candidates: if saving_path is None: - raise ValueError(("Saving path must be provided if duplicate " - "candidates should be saved.")) + raise ValueError( + ('Saving path must be provided if duplicate ' 'candidates should be saved.') + ) elif pipeline is not None: - target_filename = (f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' - + filename + '.xlsx') + target_filename = ( + f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx' + ) elif pipeline is None: target_filename = f'{filename}.xlsx' - logger.info("Saving similarity candidates...") + logger.info('Saving similarity candidates...') target_path = saving_path.joinpath(target_filename) df_candidates.to_excel(target_path) - logger.info(f"Similarity candidates saved successfully to >>{target_path}<<.") - + logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.') + return index_pairs, embds + # TODO: change implementation fully to SentenceTransformer # usage of batch processing for embeddings, use candidate idx function # from time analysis --> moved to ``helpers.py`` @@ -419,24 +455,32 @@ def similar_ids_groups( yield list(id_group) """ + def merge_similarity_dupl( data: DataFrame, - similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]], + model: SentenceTransformer, + cos_sim_threshold: float, ) -> tuple[DataFrame]: - logger.info("Start merging of similarity candidates...") - + logger.info('Start merging of similarity candidates...') + # data merged_data = data.copy() + model_input = merged_data['entry'] + candidates_idx = candidates_by_index( + data_model_input=model_input, + model=model, + cos_sim_threshold=cos_sim_threshold, + ) # graph of similar ids - similar_id_graph, _ = similar_index_connection_graph(similar_idx_pairs) - + similar_id_graph, _ = similar_index_connection_graph(candidates_idx) + for similar_id_group in similar_index_groups(similar_id_graph): similar_id_group = list(similar_id_group) - similar_data = merged_data.loc[similar_id_group,:] - # keep first entry with max number occurrences, then number of + similar_data = merged_data.loc[similar_id_group, :] + # keep first entry with max number occurrences, then number of # associated objects, then length of entry similar_data = similar_data.sort_values( - by=['num_occur', 
'num_assoc_obj_ids', 'len'], + by=['num_occur', 'num_assoc_obj_ids', 'len'], ascending=[False, False, False], ) # merge information to first entry @@ -453,11 +497,12 @@ def merge_similarity_dupl( # update entry in main dataset, drop remaining entries merged_data.update(merged_similar_data) merged_data = merged_data.drop(index=similar_id_group) - - logger.info("Similarity candidates merged successfully.") - + + logger.info('Similarity candidates merged successfully.') + return (merged_data.copy(),) + # merge duplicates def merge_similarity_dupl_old( data: DataFrame, @@ -466,15 +511,14 @@ def merge_similarity_dupl_old( # copy pre-cleaned data temp = data.copy() index = temp.index - #logger.info("Start merging of similarity candidates...") - + # logger.info("Start merging of similarity candidates...") + # iterate over index pairs - for (i1, i2) in tqdm(dupl_idx_pairs): - + for i1, i2 in tqdm(dupl_idx_pairs): # if an entry does not exist any more, skip this pair if i1 not in index or i2 not in index: continue - + # merge num occur num_occur1 = temp.at[i1, 'num_occur'] num_occur2 = temp.at[i2, 'num_occur'] @@ -493,13 +537,13 @@ def merge_similarity_dupl_old( temp.at[i1, 'num_occur'] = new_num_occur temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids - + # drop second entry temp = temp.drop(index=i2) index = temp.index - - #logger.info("Similarity candidates merged successfully.") - + + # logger.info("Similarity candidates merged successfully.") + return (temp,) @@ -508,7 +552,7 @@ def choose_cosSim_dupl_candidates( cosineSim_filt: Series, embds: dict[int, tuple[Embedding, str]], ) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]: - """providing an overview of candidates with a similarity score greater than + """providing an overview of candidates with a similarity score greater than given threshold, but decision is made manually by iterating through the candidates with user interaction; more suitable for debugging purposes @@ -520,15 +564,14 @@ def choose_cosSim_dupl_candidates( list containing relevant index pairs for entries with similarity score greater than given threshold """ - - + # compare found duplicates columns = ['idx1', 'text1', 'idx2', 'text2', 'score'] df_candidates = pd.DataFrame(columns=columns) - + index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] - for ((idx1, idx2), score) in cosineSim_filt.items(): # type: ignore + for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore # get texts for comparison text1 = embds[idx1][1] text2 = embds[idx2][1] @@ -537,21 +580,23 @@ def choose_cosSim_dupl_candidates( print('text1:\n', text1, '\n', flush=True) print('text2:\n', text2, '\n', flush=True) decision = input('Please enter >>y<< if this is a duplicate, else hit enter:') - + if not decision == 'y': continue - + # get text content from embedding as second tuple entry - content = [[ - idx1, - text1, - idx2, - text2, - score, - ]] + content = [ + [ + idx1, + text1, + idx2, + text2, + score, + ] + ] df_conc = pd.DataFrame(columns=columns, data=content) - + df_candidates = pd.concat([df_candidates, df_conc]) index_pairs.append((idx1, idx2)) - - return df_candidates, index_pairs \ No newline at end of file + + return df_candidates, index_pairs diff --git a/src/lang_main/analysis/shared.py b/src/lang_main/analysis/shared.py index 9165e96..277675b 100644 --- a/src/lang_main/analysis/shared.py +++ b/src/lang_main/analysis/shared.py @@ -1,11 +1,71 @@ -from typing import cast from collections.abc import Iterable, Iterator 
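For orientation: the rewritten merge_similarity_dupl above and get_timeline_candidates_index in timeline.py below now share the same three-step pattern that lives in analysis/shared.py — batch-encode the texts with a SentenceTransformer, turn the thresholded index pairs into an undirected graph, and read connected components off as duplicate groups. A rough usage sketch; the model checkpoint name is illustrative and not taken from the config:

from pandas import Series
from sentence_transformers import SentenceTransformer

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
texts = Series(['Pumpe defekt', 'Pumpe kaputt', 'Fenster klemmt'], index=[10, 11, 12])

pairs = candidates_by_index(data_model_input=texts, model=model, cos_sim_threshold=0.8)
graph, _ = similar_index_connection_graph(similar_idx_pairs=pairs)
for group in similar_index_groups(graph):
    print(group)  # e.g. (10, 11) if the first two entries clear the threshold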
+from typing import cast import networkx as nx +import numpy as np +import numpy.typing as npt +import sentence_transformers +import sentence_transformers.util from networkx import Graph +from pandas import Series +from sentence_transformers import SentenceTransformer +from torch import Tensor +from tqdm.auto import tqdm +from lang_main.analysis.graphs import get_graph_metadata, update_graph from lang_main.types import PandasIndex -from lang_main.analysis.graphs import update_graph, get_graph_metadata + + +def candidates_by_index( + data_model_input: Series, + model: SentenceTransformer, + cos_sim_threshold: float = 0.5, + # ) -> Iterator[tuple[PandasIndex, PandasIndex]]: +) -> Iterator[tuple[PandasIndex, PandasIndex]]: + """function to filter candidate indices based on cosine similarity + using SentenceTransformer model in batch mode, + feed data as Series to retain information about indices of entries and + access them later in the original dataset + + Parameters + ---------- + obj_id : ObjectID + _description_ + data_model_input : Series + containing indices and text entries to process + model : SentenceTransformer + necessary SentenceTransformer model to encode text entries + cos_sim_threshold : float, optional + threshold for cosine similarity to filter candidates, by default 0.5 + + Yields + ------ + Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]] + ObjectID and tuple of index pairs which meet the cosine + similarity threshold + """ + # embeddings + batch = cast(list[str], data_model_input.to_list()) + embds = cast( + Tensor, + model.encode( + batch, + convert_to_numpy=False, + convert_to_tensor=True, + show_progress_bar=False, + ), + ) + # cosine similarity + cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy()) + np.fill_diagonal(cos_sim, 0.0) + cos_sim = np.triu(cos_sim) + cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold) + + for idx_array in cos_sim_idx: + idx_pair = cast( + tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array) + ) + yield idx_pair def similar_index_connection_graph( @@ -15,21 +75,21 @@ def similar_index_connection_graph( # use this graph to get connected components (indices which belong together) # retain semantic connection on whole dataset similar_id_graph = nx.Graph() - for (idx1, idx2) in similar_idx_pairs: - # inplace operation, parent/child do not really exist in undirected graph - update_graph(graph=similar_id_graph, parent=idx1, child=idx2) - + # for idx1, idx2 in similar_idx_pairs: + # # inplace operation, parent/child do not really exist in undirected graph + # update_graph(graph=similar_id_graph, parent=idx1, child=idx2) + update_graph(graph=similar_id_graph, batch=similar_idx_pairs) + graph_info = get_graph_metadata(graph=similar_id_graph, logging=False) - + return similar_id_graph, graph_info -# TODO check returning tuple + def similar_index_groups( similar_id_graph: Graph, ) -> Iterator[tuple[PandasIndex, ...]]: # groups of connected indices - ids_groups = cast(Iterator[set[PandasIndex]], - nx.connected_components(G=similar_id_graph)) - + ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph)) + for id_group in ids_groups: - yield tuple(id_group) \ No newline at end of file + yield tuple(id_group) diff --git a/src/lang_main/analysis/timeline.py b/src/lang_main/analysis/timeline.py index 3f67bb4..9d90c7c 100644 --- a/src/lang_main/analysis/timeline.py +++ b/src/lang_main/analysis/timeline.py @@ -1,21 +1,17 @@ -from typing import cast from 
collections.abc import Iterable, Iterator +from typing import cast -import numpy as np -import numpy.typing as npt from pandas import DataFrame, Series -from torch import Tensor from sentence_transformers import SentenceTransformer -import sentence_transformers -import sentence_transformers.util -from tqdm.auto import tqdm # TODO: check deletion +from tqdm.auto import tqdm # TODO: check deletion -from lang_main.types import PandasIndex, ObjectID, TimelineCandidates -from lang_main.loggers import logger_timeline as logger from lang_main.analysis.shared import ( + candidates_by_index, similar_index_connection_graph, similar_index_groups, ) +from lang_main.loggers import logger_timeline as logger +from lang_main.types import ObjectID, PandasIndex, TimelineCandidates def non_relevant_obj_ids( @@ -25,35 +21,36 @@ def non_relevant_obj_ids( feature_uniqueness: str = 'HObjektText', feature_obj_id: str = 'ObjektID', ) -> tuple[ObjectID, ...]: - data = data.copy() ids_to_ignore: set[ObjectID] = set() - obj_ids = cast(Iterable[ObjectID], # actually NumPy array - data[feature_obj_id].unique()) + obj_ids = cast( + Iterable[ObjectID], # actually NumPy array + data[feature_obj_id].unique(), + ) for obj_id in obj_ids: feats_per_obj_id = cast( - Series, - data.loc[(data[feature_obj_id]==obj_id), feature_uniqueness] + Series, data.loc[(data[feature_obj_id] == obj_id), feature_uniqueness] ) # check for uniqueness of given feature for current ObjectID # ignore NaN values feats_per_obj_id = feats_per_obj_id.dropna() unique_feats_per_obj_id = len(feats_per_obj_id.unique()) - + if unique_feats_per_obj_id > thresh_unique_feat_per_id: ids_to_ignore.add(obj_id) - + return tuple(ids_to_ignore) + def remove_non_relevant_obj_ids( data: DataFrame, thresh_unique_feat_per_id: int, *, feature_uniqueness: str = 'HObjektText', feature_obj_id: str = 'ObjektID', -) -> DataFrame: - logger.info("Removing non-relevant ObjectIDs from dataset") +) -> tuple[DataFrame]: + logger.info('Removing non-relevant ObjectIDs from dataset') data = data.copy() ids_to_ignore = non_relevant_obj_ids( data=data, @@ -63,41 +60,11 @@ def remove_non_relevant_obj_ids( ) # only retain entries with ObjectIDs not in IDs to ignore data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))] - logger.debug(f"Ignored ObjectIDs: {ids_to_ignore}") - logger.info("Non-relevant ObjectIDs removed successfully") - - return data + logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}') + logger.info('Non-relevant ObjectIDs removed successfully') + + return (data,) -def filter_activities_per_obj_id( - data: DataFrame, - activity_feature: str = 'VorgangsTypName', - relevant_activity_types: Iterable[str] = ( - 'Reparaturauftrag (Portal)', - ), - feature_obj_id: str = 'ObjektID', - threshold_num_activities: int = 1, -) -> tuple[DataFrame, Series]: - data = data.copy() - # filter only relevant activities count occurrences for each ObjectID - logger.info("Filtering activities per ObjectID") - filt_rel_activities = data[activity_feature].isin(relevant_activity_types) - data_filter_activities = data.loc[filt_rel_activities].copy() - num_activities_per_obj_id = cast( - Series, - data_filter_activities[feature_obj_id].value_counts(sort=True) - ) - # filter for ObjectIDs with more than given number of activities - filt_below_thresh = (num_activities_per_obj_id <= threshold_num_activities) - # index of series contains ObjectIDs - obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index - filt_entries_below_thresh = (data_filter_activities[feature_obj_id] - 
.isin(obj_ids_below_thresh)) - - num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh] - data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh] - logger.info("Activities per ObjectID filtered successfully") - - return data_filter_activities, num_activities_per_obj_id def generate_model_input( data: DataFrame, @@ -107,8 +74,8 @@ def generate_model_input( 'VorgangsArtText', 'VorgangsBeschreibung', ), -) -> DataFrame: - logger.info("Generating concatenation of model input features") +) -> tuple[DataFrame]: + logger.info('Generating concatenation of model input features') data = data.copy() model_input_features = list(model_input_features) input_features = data[model_input_features].fillna('').astype(str) @@ -116,9 +83,40 @@ def generate_model_input( lambda x: ' - '.join(x), axis=1, ) - logger.info("Model input generated successfully") - - return data + logger.info('Model input generated successfully') + + return (data,) + + +def filter_activities_per_obj_id( + data: DataFrame, + activity_feature: str = 'VorgangsTypName', + relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',), + feature_obj_id: str = 'ObjektID', + threshold_num_activities: int = 1, +) -> tuple[DataFrame, Series]: + data = data.copy() + # filter only relevant activities count occurrences for each ObjectID + logger.info('Filtering activities per ObjectID') + filt_rel_activities = data[activity_feature].isin(relevant_activity_types) + data_filter_activities = data.loc[filt_rel_activities].copy() + num_activities_per_obj_id = cast( + Series, data_filter_activities[feature_obj_id].value_counts(sort=True) + ) + # filter for ObjectIDs with more than given number of activities + filt_below_thresh = num_activities_per_obj_id <= threshold_num_activities + # index of series contains ObjectIDs + obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index + filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin( + obj_ids_below_thresh + ) + + num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh] + data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh] + logger.info('Activities per ObjectID filtered successfully') + + return data_filter_activities, num_activities_per_obj_id + # for each obj_id in relevant_obj_ids ## filter data for obj_id @@ -130,6 +128,7 @@ def generate_model_input( ## obtain idx pairs, yield ## use idx pairs to get idx values of series + def get_timeline_candidates_index( data: DataFrame, num_activities_per_obj_id: Series, @@ -140,92 +139,33 @@ def get_timeline_candidates_index( model_input_feature: str = 'nlp_model_input', ) -> Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]]: # already sorted ObjIDs (descending regarding number of activities) - obj_ids = cast(Iterable[ObjectID], - num_activities_per_obj_id.index) - + obj_ids = cast(Iterable[ObjectID], num_activities_per_obj_id.index) + for obj_id in tqdm(obj_ids): - data_per_obj_id = cast( - DataFrame, - data.loc[data[feature_obj_id]==obj_id] - ) + data_per_obj_id = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id]) data_model_input = data_per_obj_id[model_input_feature] - + candidates_idx = candidates_by_index( data_model_input=data_model_input, model=model, cos_sim_threshold=cos_sim_threshold, ) # directly process candidates - candidates_idx = tuple(candidates_idx) + # candidates_idx = tuple(candidates_idx) similar_id_graph, _ = similar_index_connection_graph( similar_idx_pairs=candidates_idx, ) - + for 
index_group in similar_index_groups(similar_id_graph): yield obj_id, index_group + # TODO: check application for duplicate removal -def candidates_by_index( - data_model_input: Series, - model: SentenceTransformer, - cos_sim_threshold: float = 0.5, -) -> Iterator[tuple[PandasIndex, PandasIndex]]: - """function to filter candidate indices based on cosine similarity - using SentenceTransformer model in batch mode, - feed data as Series to retain information about indices of entries and - access them later in the original dataset - - Parameters - ---------- - obj_id : ObjectID - _description_ - data_model_input : Series - containing indices and text entries to process - model : SentenceTransformer - necessary SentenceTransformer model to encode text entries - cos_sim_threshold : float, optional - threshold for cosine similarity to filter candidates, by default 0.5 - - Yields - ------ - Iterator[tuple[ObjectID, tuple[PandasIndex, PandasIndex]]] - ObjectID and tuple of index pairs which meet the cosine - similarity threshold - """ - # embeddings - batch = cast(list[str], - data_model_input.to_list()) - embds = cast( - Tensor, - model.encode( - batch, - convert_to_numpy=False, - convert_to_tensor=True, - show_progress_bar=False, - ) - ) - # cosine similarity - cos_sim = cast( - npt.NDArray, - sentence_transformers.util.cos_sim(embds, embds).numpy() - ) - np.fill_diagonal(cos_sim, 0.) - cos_sim = np.triu(cos_sim) - cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold) - - for idx_array in cos_sim_idx: - idx_pair = cast( - tuple[np.int64, np.int64], - tuple(data_model_input.index[idx] for idx in idx_array) - ) - yield idx_pair - - def transform_timeline_candidates( candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]], ) -> TimelineCandidates: """function to build a mapping of ObjectIDs to their respective collection of - timeline candidates (as tuple), each candidate group is separated as distinct + timeline candidates (as tuple), each candidate group is separated as distinct tuple within this outer tuple Parameters @@ -238,12 +178,12 @@ def transform_timeline_candidates( dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]] dictionary: ObjectID -> tuple of candidate groups """ - + candidates_by_obj_id: TimelineCandidates = {} - + obj_id_target: ObjectID | None = None collection: list[tuple[PandasIndex, ...]] = [] - + for obj_id, cands in candidates: if obj_id_target is None: collection = [] @@ -253,26 +193,58 @@ def transform_timeline_candidates( collection = [] obj_id_target = obj_id collection.append(cands) - + if collection and obj_id_target is not None: candidates_by_obj_id[obj_id_target] = tuple(collection) - + return candidates_by_obj_id -def map_obj_texts( + +def map_obj_id_to_texts( data: DataFrame, - obj_ids: Iterable[ObjectID], + feature_obj_id: str = 'ObjektID', ) -> dict[ObjectID, str]: + data = data.copy() + obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique()) + obj_id_to_text: dict[ObjectID, str] = {} - - for obj_id in obj_ids: - data_per_obj = cast( - DataFrame, - data.loc[data['ObjektID']==obj_id] - ) + + for obj_id in tqdm(obj_ids): + data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id]) # just take first entry obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0]) obj_text = obj_text.strip(r' ,.:') obj_id_to_text[obj_id] = obj_text - - return obj_id_to_text \ No newline at end of file + + return obj_id_to_text + + +def get_timeline_candidates( + data: DataFrame, + num_activities_per_obj_id: Series, + *, + model: 
SentenceTransformer, + cos_sim_threshold: float, + feature_obj_id: str = 'ObjektID', + model_input_feature: str = 'nlp_model_input', +) -> tuple[TimelineCandidates, dict[ObjectID, str]]: + logger.info('Obtaining timeline candidates...') + candidates = get_timeline_candidates_index( + data=data, + num_activities_per_obj_id=num_activities_per_obj_id, + model=model, + cos_sim_threshold=cos_sim_threshold, + feature_obj_id=feature_obj_id, + model_input_feature=model_input_feature, + ) + tl_candidates = transform_timeline_candidates(candidates) + logger.info('Timeline candidates obtained successfully.') + # text mapping to obtain object descriptors + logger.info('Mapping ObjectIDs to their respective text descriptor...') + map_obj_text = map_obj_id_to_texts( + data=data, + feature_obj_id=feature_obj_id, + ) + logger.info('ObjectIDs successfully mapped to text descriptors.') + + return tl_candidates, map_obj_text diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py index 02c05e9..cf4efb2 100644 --- a/src/lang_main/analysis/tokens.py +++ b/src/lang_main/analysis/tokens.py @@ -1,56 +1,56 @@ -from typing import cast import re -from itertools import combinations from collections.abc import Iterator +from itertools import combinations +from typing import cast from dateutil.parser import parse -from spacy.tokens.token import Token as SpacyToken -from spacy.tokens.doc import Doc as SpacyDoc -from spacy.lang.de import German as GermanSpacyModel from pandas import DataFrame +from spacy.lang.de import German as GermanSpacyModel +from spacy.tokens.doc import Doc as SpacyDoc +from spacy.tokens.token import Token as SpacyToken from tqdm.auto import tqdm -from lang_main.loggers import logger_token_analysis as logger from lang_main.analysis.graphs import ( - update_graph, TokenGraph, + update_graph, ) - +from lang_main.loggers import logger_token_analysis as logger # ** Logging -#LOGGING_LEVEL = 'INFO' -#logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout) -#logger = logging.getLogger('ihm_analyse.token_analysis') +# LOGGING_LEVEL = 'INFO' +# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout) +# logger = logging.getLogger('ihm_analyse.token_analysis') # ** POS -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX']) -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX']) -#POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX']) +# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN']) POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX']) -#POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB']) +# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB']) POS_INDIRECT: frozenset[str] = frozenset(['AUX']) # ** TAG -#TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD']) +# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD']) TAG_OF_INTEREST: frozenset[str] = frozenset() # ** obtaining connection in texts + def pre_clean_word(string: str) -> str: - pattern = r'[^A-Za-zäöüÄÖÜ]+' string = re.sub(pattern, '', string) - + return string -# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format + +# https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format def is_str_date( - string: str, + string: str, fuzzy: bool = False, ) -> bool: - #print(string) + # print(string) try: 
# check if string is a number # if length is greater than 8, it is not a date @@ -60,33 +60,38 @@ def is_str_date( except ValueError: # not a number pass - + try: parse(string, fuzzy=fuzzy) return True except ValueError: return False + def obtain_relevant_descendants( - token: SpacyToken, + token: SpacyToken, ) -> Iterator[SpacyToken]: - for descendant in token.subtree: # subtrees contain the token itself # if current element is token skip this element if descendant == token: continue - + # if descendant is a date skip it) if is_str_date(string=descendant.text): continue - - logger.debug((f"Token >>{token}<<, POS >>{token.pos_}<< | descendant " - f">>{descendant}<<, POS >>{descendant.pos_}<<")) - + + logger.debug( + ( + f'Token >>{token}<<, POS >>{token.pos_}<< | descendant ' + f'>>{descendant}<<, POS >>{descendant.pos_}<<' + ) + ) + # eliminate cases of cross-references with verbs - if ((token.pos_ == 'AUX' or token.pos_ == 'VERB') and - (descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB')): + if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and ( + descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB' + ): continue # skip cases in which descendant is indirect POS with others than verbs elif descendant.pos_ in POS_INDIRECT: @@ -94,11 +99,12 @@ def obtain_relevant_descendants( # skip cases in which child has no relevant POS or TAG elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST): continue - + yield descendant - + # TODO look at results and fine-tune function accordingly + def add_doc_info_to_graph( graph: TokenGraph, doc: SpacyDoc, @@ -114,7 +120,7 @@ def add_doc_info_to_graph( # skip token which are dates or times if is_str_date(string=token.text): continue - + relevant_descendants = obtain_relevant_descendants(token=token) # for non-AUX: add parent <--> descendant pair to graph if token.pos_ not in POS_INDIRECT: @@ -124,13 +130,13 @@ def add_doc_info_to_graph( graph=graph, parent=token.lemma_, child=descendant.lemma_, - weight_connection=weight + weight_connection=weight, ) else: # if indirect POS, make connection between all associated words combs = combinations(relevant_descendants, r=2) for comb in combs: - # !! parents and children do not really exist in this case, + # !! parents and children do not really exist in this case, # !! 
but only one connection is made update_graph( graph=graph, @@ -139,32 +145,33 @@ def add_doc_info_to_graph( weight_connection=weight, ) + def build_token_graph( data: DataFrame, model: GermanSpacyModel, ) -> tuple[TokenGraph]: # empty NetworkX directed graph - #graph = nx.DiGraph() + # graph = nx.DiGraph() graph = TokenGraph() - + for row in tqdm(data.itertuples(), total=len(data)): # obtain properties from tuple # attribute names must match with preprocessed data entry_text = cast(str, row.entry) weight = cast(int, row.num_occur) - + # get spacy model output doc = model(entry_text) - + add_doc_info_to_graph( graph=graph, doc=doc, weight=weight, ) - + # metadata graph.update_metadata() # convert to undirected graph.to_undirected() - - return (graph,) \ No newline at end of file + + return (graph,) diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py new file mode 100644 index 0000000..c60439f --- /dev/null +++ b/src/lang_main/constants.py @@ -0,0 +1,55 @@ +from pathlib import Path +from typing import Final + +from lang_main import CONFIG + +# ** paths +INPUT_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['inputs']) +SAVE_PATH_FOLDER: Final[Path] = Path(CONFIG['paths']['results']) +PATH_TO_DATASET: Final[Path] = Path(CONFIG['paths']['dataset']) +# ** control +DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing'] +SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip'] +DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis'] +SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip'] +DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing'] +SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip'] +DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis'] +SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip'] +# ** export + +# ** preprocessing +FILENAME_COSSIM_FILTER_CANDIDATES: Final[str] = CONFIG['preprocess'][ + 'filename_cossim_filter_candidates' +] +DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols'] +THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][ + 'threshold_amount_characters' +] +THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'] +# ** token analysis + +# ** graph postprocessing +THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight'] +# ** time analysis.uniqueness +THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][ + 'threshold_unique_texts' +] +UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][ + 'criterion_feature' +] +FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id'] +# ** time_analysis.model_input +MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple( + CONFIG['time_analysis']['model_input']['input_features'] +) +ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature'] +ACTIVITY_TYPES: Final[tuple[str]] = tuple( + CONFIG['time_analysis']['model_input']['activity_types'] +) +THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][ + 'threshold_num_acitivities' +] +THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][ + 'threshold_similarity' +] diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml new file mode 100644 index 0000000..c694e25 --- /dev/null +++ b/src/lang_main/lang_main_config.toml @@ -0,0 +1,56 @@ +# 
lang_main: Config file + +[paths] +inputs = './inputs/' +results = './results/test_new2/' +dataset = './01_2_Rohdaten_neu/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = false +graph_postprocessing = false +graph_postprocessing_skip = false +time_analysis = false +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + +[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +input_features = [ + 'VorgangsTypName', + 'VorgangsArtText', + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py index f33302e..eadbb4d 100644 --- a/src/lang_main/loggers.py +++ b/src/lang_main/loggers.py @@ -1,5 +1,5 @@ -from typing import Final import logging +from typing import Final from lang_main.types import LoggingLevels diff --git a/src/lang_main/pipelines/base.py b/src/lang_main/pipelines/base.py index 8273c4b..ad78589 100644 --- a/src/lang_main/pipelines/base.py +++ b/src/lang_main/pipelines/base.py @@ -1,20 +1,18 @@ -from typing import Any -#from types import FunctionType -import sys -import logging from collections.abc import Callable from pathlib import Path +from typing import Any from lang_main.loggers import logger_pipelines as logger -from lang_main.shared import save_pickle, load_pickle +from lang_main.shared import load_pickle, save_pickle # ** pipelines to perform given actions on dataset in a customisable manner + class NoPerformableActionError(Exception): """Error describing that no action is available in the current pipeline""" -class BasePipeline(): - + +class BasePipeline: def __init__( self, name: str, @@ -22,12 +20,14 @@ class BasePipeline(): ) -> None: # init base class super().__init__() - + # name of pipeline self.name = name # working directory for pipeline == output path self.working_dir = working_dir - + # if not self.working_dir.exists(): + # self.working_dir.mkdir(parents=True) + # container for actions to perform during pass self.actions: list[Callable] = [] self.action_names: list[str] = [] @@ -37,15 +37,17 @@ class BasePipeline(): self.curr_proc_idx: int = 1 # intermediate result self._intermediate_result: Any | None = None - + def __repr__(self) -> str: - return (f"{self.__class__.__name__}(name: {self.name}, " - f"working dir: {self.working_dir}, contents: {self.action_names})") - + return ( + f'{self.__class__.__name__}(name: {self.name}, ' + f'working dir: {self.working_dir}, contents: {self.action_names})' + ) + @property def intermediate_result(self) -> Any: return self._intermediate_result - + def add( self, action: Callable, @@ -53,16 +55,17 @@ class BasePipeline(): save_result: bool = False, ) -> None: # check explicitly for 
function type - #if isinstance(action, FunctionType): + # if isinstance(action, FunctionType): if isinstance(action, Callable): self.actions.append(action) self.action_names.append(action.__name__) self.actions_kwargs.append(action_kwargs.copy()) self.is_save_result.append(save_result) else: - raise TypeError(("Action must be custom function, " - f"but is of type >>{type(action)}<<.")) - + raise TypeError( + f'Action must be custom function, but is of type >>{type(action)}<<.' + ) + # TODO: add multiple entries by utilising simple add method """ def add_multi( @@ -84,7 +87,7 @@ class BasePipeline(): raise TypeError(("Action must be function or sequence of functions, " f"but is of type >>{type(action)}<<.")) """ - + def save_curr_result( self, filename: str, @@ -94,7 +97,7 @@ class BasePipeline(): target_path = target_path.with_suffix('.pkl') # saving file locally save_pickle(obj=self._intermediate_result, path=target_path) - + def load_intermediate_result( self, saving_path: str, @@ -103,25 +106,26 @@ class BasePipeline(): target_path = Path(saving_path + filename).with_suffix('.pkl') # loading DataFrame or Series from pickle data = load_pickle(target_path) - + return data - + def prep_run(self) -> None: - logger.info(f"Starting processing pipeline >>{self.name}<<...") + logger.info(f'Starting processing pipeline >>{self.name}<<...') # progress tracking self.curr_proc_idx = 1 # check if performable actions available if len(self.actions) == 0: - raise NoPerformableActionError(("The pipeline does not contain any " - "performable actions.")) - + raise NoPerformableActionError( + ('The pipeline does not contain any ' 'performable actions.') + ) + def run( self, starting_values: tuple[Any, ...], ) -> tuple[Any, ...]: # prepare start self.prep_run() - + for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)): if idx == 0: ret = action(*starting_values, **action_kwargs) @@ -134,7 +138,7 @@ class BasePipeline(): self.save_curr_result(filename=self.action_names[idx]) # processing tracking self.curr_proc_idx += 1 - - logger.info(f"Processing pipeline >>{self.name}<< successfully ended.") - - return ret \ No newline at end of file + + logger.info(f'Processing pipeline >>{self.name}<< successfully ended.') + + return ret diff --git a/src/lang_main/pipelines/predefined.py b/src/lang_main/pipelines/predefined.py index e440646..ea168dc 100644 --- a/src/lang_main/pipelines/predefined.py +++ b/src/lang_main/pipelines/predefined.py @@ -1,57 +1,144 @@ -from sentence_transformers import SentenceTransformer import spacy +from sentence_transformers import SentenceTransformer -from lang_main import ( - SAVE_PATH_FOLDER, - DATE_COLS, - FILENAME_COSSIM_FILTER_CANDIDATES, - THRESHOLD_SIMILARITY, -) -from lang_main.pipelines.base import BasePipeline from lang_main.analysis.preprocessing import ( - load_raw_data, - remove_duplicates, - remove_NA, + analyse_feature, clean_string_slim, entry_wise_cleansing, - analyse_feature, - build_cosSim_matrix, - filt_thresh_cosSim_matrix, - list_cosSim_dupl_candidates, + load_raw_data, merge_similarity_dupl, + remove_duplicates, + remove_NA, +) +from lang_main.analysis.timeline import ( + filter_activities_per_obj_id, + generate_model_input, + get_timeline_candidates, + remove_non_relevant_obj_ids, ) from lang_main.analysis.tokens import build_token_graph +from lang_main.constants import ( + ACTIVITY_FEATURE, + ACTIVITY_TYPES, + DATE_COLS, + FEATURE_NAME_OBJ_ID, + MODEL_INPUT_FEATURES, + SAVE_PATH_FOLDER, + THRESHOLD_NUM_ACTIVITIES, + 
THRESHOLD_SIMILARITY, + THRESHOLD_TIMELINE_SIMILARITY, + THRESHOLD_UNIQUE_TEXTS, + UNIQUE_CRITERION_FEATURE, +) +from lang_main.pipelines.base import BasePipeline # ** pipeline configuration # ** target feature preparation pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER) -pipe_target_feat.add(load_raw_data, {'date_cols': DATE_COLS}) +pipe_target_feat.add( + load_raw_data, + { + 'date_cols': DATE_COLS, + }, +) pipe_target_feat.add(remove_duplicates) pipe_target_feat.add(remove_NA, save_result=True) -pipe_target_feat.add(entry_wise_cleansing, {'target_feature': 'VorgangsBeschreibung', 'cleansing_func': clean_string_slim}) -pipe_target_feat.add(analyse_feature, {'target_feature': 'VorgangsBeschreibung'}, save_result=True) +pipe_target_feat.add( + entry_wise_cleansing, + { + 'target_feature': 'VorgangsBeschreibung', + 'cleansing_func': clean_string_slim, + }, +) +pipe_target_feat.add( + analyse_feature, + { + 'target_feature': 'VorgangsBeschreibung', + }, + save_result=True, +) # output: DataFrame containing target feature with # number of occurrences and associated ObjectIDs # ** embedding pipe +# ?? still needed? # using similarity between entries to catch duplicates with typo or similar content -pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) +# pipe_embds = BasePipeline(name='Embedding1', working_dir=SAVE_PATH_FOLDER) model_spacy = spacy.load('de_dep_news_trf') model_stfr = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') -pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) -pipe_embds.add(filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True) -pipe_embds.add( - list_cosSim_dupl_candidates, - {'save_candidates': True, - 'saving_path': SAVE_PATH_FOLDER, - 'filename': FILENAME_COSSIM_FILTER_CANDIDATES, - 'pipeline': pipe_embds}, save_result=True) +# pipe_embds.add(build_cosSim_matrix, {'model': model_stfr}, save_result=True) +# pipe_embds.add( +# filt_thresh_cosSim_matrix, {'threshold': THRESHOLD_SIMILARITY}, save_result=True +# ) +# pipe_embds.add( +# list_cosSim_dupl_candidates, +# { +# 'save_candidates': True, +# 'saving_path': SAVE_PATH_FOLDER, +# 'filename': FILENAME_COSSIM_FILTER_CANDIDATES, +# 'pipeline': pipe_embds, +# }, +# save_result=True, +# ) # ** Merge duplicates pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER) -pipe_merge.add(merge_similarity_dupl, save_result=True) +# pipe_merge.add(merge_similarity_dupl, save_result=True) +pipe_merge.add( + merge_similarity_dupl, + { + 'model': model_stfr, + 'cos_sim_threshold': THRESHOLD_SIMILARITY, + }, + save_result=True, +) # ** token analysis pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER) -pipe_token_analysis.add(build_token_graph, {'model': model_spacy}, save_result=True) +pipe_token_analysis.add( + build_token_graph, + { + 'model': model_spacy, + }, + save_result=True, +) + + +# ** timeline analysis +pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER) +pipe_timeline.add( + remove_non_relevant_obj_ids, + { + 'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS, + 'feature_uniqueness': UNIQUE_CRITERION_FEATURE, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + }, + save_result=True, +) +pipe_timeline.add( + generate_model_input, + { + 'target_feature_name': 'nlp_model_input', + 'model_input_features': MODEL_INPUT_FEATURES, + }, +) +pipe_timeline.add( + filter_activities_per_obj_id, + { + 'activity_feature': 
ACTIVITY_FEATURE, + 'relevant_activity_types': ACTIVITY_TYPES, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + 'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES, + }, +) +pipe_timeline.add( + get_timeline_candidates, + { + 'model': model_stfr, + 'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY, + 'feature_obj_id': FEATURE_NAME_OBJ_ID, + 'model_input_feature': 'nlp_model_input', + }, + save_result=True, +) diff --git a/src/lang_main/shared.py b/src/lang_main/shared.py index e54286b..e44139f 100644 --- a/src/lang_main/shared.py +++ b/src/lang_main/shared.py @@ -1,56 +1,67 @@ -from typing import Any import os -import shutil import pickle +import shutil import tomllib from pathlib import Path +from typing import Any from lang_main.loggers import logger_shared_helpers as logger + # ** Lib def create_saving_folder( saving_path_folder: str | Path, overwrite_existing: bool = False, ) -> None: # check for existence of given path - if not os.path.exists(saving_path_folder): - os.makedirs(saving_path_folder) + if isinstance(saving_path_folder, str): + saving_path_folder = Path(saving_path_folder) + if not saving_path_folder.exists(): + saving_path_folder.mkdir(parents=True) else: if overwrite_existing: # overwrite if desired (deletes whole path and re-creates it) shutil.rmtree(saving_path_folder) os.makedirs(saving_path_folder) else: - logger.info((f"Path >>{saving_path_folder}<< already exists and remained " - "unchanged. If you want to overwrite this path, use parameter " - ">>overwrite_existing<<.")) + logger.info( + ( + f'Path >>{saving_path_folder}<< already exists and remained ' + f'unchanged. If you want to overwrite this path, use parameter ' + f'>>overwrite_existing<<.' + ) + ) + def load_toml_config( path_to_toml: str | Path, ) -> dict[str, Any]: - with open(path_to_toml, "rb") as f: + with open(path_to_toml, 'rb') as f: data = tomllib.load(f) - logger.info("Loaded TOML config file successfully.") + logger.info('Loaded TOML config file successfully.') return data + # saving and loading using pickle # careful: pickling from unknown sources can be dangerous def save_pickle( - obj: Any, + obj: Any, path: str | Path, ) -> None: with open(path, 'wb') as file: pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) - logger.info(f"Saved file successfully under {path}") + logger.info(f'Saved file successfully under {path}') + def load_pickle( path: str | Path, ) -> Any: with open(path, 'rb') as file: obj = pickle.load(file) - logger.info("Loaded file successfully.") + logger.info('Loaded file successfully.') return obj + # TODO: remove, too specialised for common application """ def filter_candidates_idx( @@ -103,4 +114,4 @@ def filter_candidates_idx( tuple(data_model_input.index[idx] for idx in idx_array) ) yield idx_pair -""" \ No newline at end of file +""" diff --git a/src/lang_main/types.py b/src/lang_main/types.py index 85d032a..a635987 100644 --- a/src/lang_main/types.py +++ b/src/lang_main/types.py @@ -1,4 +1,4 @@ -from typing import TypeAlias, Literal +from typing import Literal, TypeAlias import numpy as np from spacy.tokens.doc import Doc as SpacyDoc @@ -6,7 +6,7 @@ from torch import Tensor LoggingLevels: TypeAlias = Literal[ 'DEBUG', - 'INFO', + 'INFO', 'WARNING', 'ERROR', 'CRITICAL', @@ -16,4 +16,4 @@ PandasIndex: TypeAlias = int | np.int64 ObjectID: TypeAlias = int Embedding: TypeAlias = SpacyDoc | Tensor -TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]] \ No newline at end of file +TimelineCandidates: TypeAlias = dict[ObjectID, 
tuple[tuple[PandasIndex, ...], ...]] diff --git a/test-notebooks/Preprocess_Pipeline.ipynb b/test-notebooks/Preprocess_Pipeline.ipynb index 6525b52..c6c6626 100644 --- a/test-notebooks/Preprocess_Pipeline.ipynb +++ b/test-notebooks/Preprocess_Pipeline.ipynb @@ -13,29 +13,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "bca16fc4-1ffb-48ef-bd0d-bdc782428a45", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:ihm_analyse.helpers:Loaded TOML config file successfully.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\foersterflorian\\mambaforge\\envs\\ihm2\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'ihm_analyse'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CONFIG\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocess\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 3\u001b[0m load_raw_data,\n\u001b[0;32m 4\u001b[0m remove_duplicates,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m merge_similarity_dupl,\n\u001b[0;32m 13\u001b[0m )\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mihm_analyse\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipelines\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BasePipeline, EmbeddingPipeline\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'ihm_analyse'" ] } ], "source": [ - "from ihm_analyse import CONFIG\n", - "from ihm_analyse.lib.preprocess import (\n", + "from lang_main import CONFIG\n", + "from lang_main.lib.preprocess import (\n", " load_raw_data,\n", " remove_duplicates,\n", " remove_NA,\n", @@ -47,8 +43,8 @@ " list_cosSim_dupl_candidates,\n", " merge_similarity_dupl,\n", ")\n", - "from ihm_analyse.lib.pipelines import BasePipeline, EmbeddingPipeline\n", - "from ihm_analyse.lib.helpers import (\n", + "from lang_main.pipelines import BasePipeline, EmbeddingPipeline\n", + "from lang_main.lib.helpers import (\n", " save_pickle, \n", " load_pickle, \n", " create_saving_folder,\n", diff --git a/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl b/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl differ diff --git a/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl b/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl new file mode 100644 index 0000000..5565194 Binary files /dev/null and 
b/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl differ diff --git a/test-notebooks/dashboard/app.py b/test-notebooks/dashboard/app.py index c190b0d..29689d6 100644 --- a/test-notebooks/dashboard/app.py +++ b/test-notebooks/dashboard/app.py @@ -1,28 +1,42 @@ from typing import cast +from pathlib import Path +import pandas as pd +import plotly.express as px from dash import ( Dash, - html, - dcc, - callback, - Output, Input, + Output, State, + callback, dash_table, + dcc, + html, ) -import plotly.express as px -import pandas as pd +from lang_main import load_pickle +from lang_main.types import ObjectID, TimelineCandidates from pandas import DataFrame -from lang_main import load_pickle -from lang_main.types import TimelineCandidates, ObjectID - -#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') +# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') # ** data -data = cast(DataFrame, load_pickle('./data.pkl')) -cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl')) -texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl')) +p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl') +p_tl = Path( + r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl' +) +ret = cast(DataFrame, load_pickle(p_df)) +data = ret[0] +ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl)) +cands = ret[0] +texts = ret[1] + +# p_df = Path(r'.\test-notebooks\dashboard\data.pkl') +# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl') +# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl') +# data = cast(DataFrame, load_pickle(p_df)) +# cands = cast(TimelineCandidates, load_pickle(p_cands)) +# texts = cast(dict[ObjectID, str], load_pickle(p_map)) + table_feats = [ 'ErstellungsDatum', 'ErledigungsDatum', @@ -52,25 +66,28 @@ hover_data = { app = Dash(prevent_initial_callbacks=True) app.layout = [ - html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}), - html.Div(children=[ - html.H2('Wählen Sie ein Objekt aus (ObjektID):'), - dcc.Dropdown( - list(cands.keys()), - id='dropdown-selection', - placeholder="ObjektID auswählen...", - ) - ]), - html.Div(children=[ - html.H3(id='object_text'), - dcc.Dropdown(id='choice-candidates'), - dcc.Graph(id='graph-output'), - ]), - html.Div(children=[ - dash_table.DataTable(id='table-candidates') - ]), + html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}), + html.Div( + children=[ + html.H2('Wählen Sie ein Objekt aus (ObjektID):'), + dcc.Dropdown( + list(cands.keys()), + id='dropdown-selection', + placeholder='ObjektID auswählen...', + ), + ] + ), + html.Div( + children=[ + html.H3(id='object_text'), + dcc.Dropdown(id='choice-candidates'), + dcc.Graph(id='graph-output'), + ] + ), + html.Div(children=[dash_table.DataTable(id='table-candidates')]), ] + @callback( Output('object_text', 'children'), Input('dropdown-selection', 'value'), @@ -82,6 +99,7 @@ def update_obj_text(obj_id): headline = f'HObjektText: {obj_text}' return headline + @callback( Output('choice-candidates', 'options'), Input('dropdown-selection', 'value'), @@ -90,9 +108,10 @@ def update_obj_text(obj_id): def update_choice_candidates(obj_id): obj_id = int(obj_id) cands_obj_id = cands[obj_id] - choices = list(range(1, len(cands_obj_id)+1)) + choices = list(range(1, len(cands_obj_id) + 1)) return choices + @callback( 
Output('graph-output', 'figure'), Input('choice-candidates', 'value'), @@ -106,7 +125,7 @@ def update_timeline(index, obj_id): title = f'HObjektText: {obj_text}' # cands cands_obj_id = cands[obj_id] - cands_choice = cands_obj_id[int(index)-1] + cands_choice = cands_obj_id[int(index) - 1] # data df = data.loc[list(cands_choice)].sort_index() # figure @@ -117,22 +136,18 @@ def update_timeline(index, obj_id): title=title, hover_data=hover_data, ) - fig.update_traces( - mode='markers+lines', - marker=markers, - marker_symbol='diamond' - ) + fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond') fig.update_xaxes( - tickformat="%B\n%Y", + tickformat='%B\n%Y', rangeslider_visible=True, ) fig.update_yaxes(type='category') - fig.update_layout(hovermode="x unified") + fig.update_layout(hovermode='x unified') return fig + @callback( - [Output('table-candidates', 'data'), - Output('table-candidates', 'columns')], + [Output('table-candidates', 'data'), Output('table-candidates', 'columns')], Input('choice-candidates', 'value'), State('dropdown-selection', 'value'), prevent_initial_call=True, @@ -141,19 +156,20 @@ def update_table_candidates(index, obj_id): obj_id = int(obj_id) # cands cands_obj_id = cands[obj_id] - cands_choice = cands_obj_id[int(index)-1] + cands_choice = cands_obj_id[int(index) - 1] # data df = data.loc[list(cands_choice)].sort_index() - df = (df - .filter(items=table_feats, axis=1) - .sort_values(by='ErstellungsDatum', ascending=True)) - cols = [{"name": i, "id": i} for i in df.columns] + df = df.filter(items=table_feats, axis=1).sort_values( + by='ErstellungsDatum', ascending=True + ) + cols = [{'name': i, 'id': i} for i in df.columns] # convert dates to strings for col in table_feats_dates: df[col] = df[col].dt.strftime(r'%Y-%m-%d') - + table_data = df.to_dict('records') return table_data, cols + if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/test-notebooks/dashboard/data.pkl b/test-notebooks/dashboard/archive/data.pkl similarity index 100% rename from test-notebooks/dashboard/data.pkl rename to test-notebooks/dashboard/archive/data.pkl diff --git a/test-notebooks/dashboard/map_candidates.pkl b/test-notebooks/dashboard/archive/map_candidates.pkl similarity index 100% rename from test-notebooks/dashboard/map_candidates.pkl rename to test-notebooks/dashboard/archive/map_candidates.pkl diff --git a/test-notebooks/dashboard/map_texts.pkl b/test-notebooks/dashboard/archive/map_texts.pkl similarity index 100% rename from test-notebooks/dashboard/map_texts.pkl rename to test-notebooks/dashboard/archive/map_texts.pkl diff --git a/test-notebooks/dashboard/lang_main_config.toml b/test-notebooks/dashboard/lang_main_config.toml new file mode 100644 index 0000000..c694e25 --- /dev/null +++ b/test-notebooks/dashboard/lang_main_config.toml @@ -0,0 +1,56 @@ +# lang_main: Config file + +[paths] +inputs = './inputs/' +results = './results/test_new2/' +dataset = './01_2_Rohdaten_neu/Export4.csv' +#results = './results/Export7/' +#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' +#results = './results/Export7_trunc/' +#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' + +[control] +preprocessing = true +preprocessing_skip = false +token_analysis = false +token_analysis_skip = false +graph_postprocessing = false +graph_postprocessing_skip = false +time_analysis = false +time_analysis_skip = false + +#[export_filenames] +#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' + 
+[preprocess] +filename_cossim_filter_candidates = 'CosSim-FilterCandidates' +date_cols = [ + "VorgangsDatum", + "ErledigungsDatum", + "Arbeitsbeginn", + "ErstellungsDatum", +] +threshold_amount_characters = 5 +threshold_similarity = 0.8 + +[graph_postprocessing] +threshold_edge_weight = 150 + +[time_analysis.uniqueness] +threshold_unique_texts = 4 +criterion_feature = 'HObjektText' +feature_name_obj_id = 'ObjektID' + +[time_analysis.model_input] +input_features = [ + 'VorgangsTypName', + 'VorgangsArtText', + 'VorgangsBeschreibung', +] +activity_feature = 'VorgangsTypName' +activity_types = [ + 'Reparaturauftrag (Portal)', + 'Störungsmeldung', +] +threshold_num_acitivities = 1 +threshold_similarity = 0.8 \ No newline at end of file diff --git a/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl b/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl new file mode 100644 index 0000000..bba1d89 Binary files /dev/null and b/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl differ diff --git a/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl b/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl new file mode 100644 index 0000000..5565194 Binary files /dev/null and b/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl differ diff --git a/test-notebooks/display_results.ipynb b/test-notebooks/display_results.ipynb new file mode 100644 index 0000000..ce71331 --- /dev/null +++ b/test-notebooks/display_results.ipynb @@ -0,0 +1,663 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "id": "3760b040-985c-46ec-ba77-13f0f7a52c83", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from lang_main import load_pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "97487448-82c8-4b3d-8a1a-ccccaaac8d86", + "metadata": {}, + "outputs": [], + "source": [ + "def get_files(path: str) -> tuple[Path, ...]:\n", + " p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + " assert p.exists(), \"path does not exist\"\n", + " return tuple(p.glob(r'*'))" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "598f4d99-9d35-49c9-8c5d-113d4c80cecf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + "files" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "55ad4af3-87cd-4189-9309-171aba4e04a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shared:INFO | 2024-05-29 12:49:47 +0000 | Loaded file successfully.\n" + ] + } + ], + "source": [ + "file = files[-1]\n", + "ret = load_pickle(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "540f4720-a2bf-4171-8db5-8e6993d38c13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrylennum_occurassoc_obj_idsnum_assoc_obj_ids
162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
33Wöchentliche Sichtkontrolle / Reinigung393108[301, 304, 305, 313, 314, 323, 329, 331, 332, ...74
131Tägliche Überprüfung der Ölabscheider371619[0, 970, 2134, 2137]4
160Wöchentliche Kontrolle der C-Anlagen361265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
..................
2559Fehler 9723 Leistungsversorgung Antrieb defekt461[211]1
2558T-Warp-Let-Off1 schleppfehler301[93]1
2557Fahrräder wurden gewartet und gereinigt.401[1707]1
2556Bohrlöcher an Gebots- und Verbotszeichen anbri...1731[1]1
6782Befestigung Deckel für Batteriefach defekt ...1062[306, 326]2
\n", + "

4545 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " entry ... num_assoc_obj_ids\n", + "162 Tägliche Wartungstätigkeiten nach Vorgabe des ... ... 206\n", + "33 Wöchentliche Sichtkontrolle / Reinigung ... 74\n", + "131 Tägliche Überprüfung der Ölabscheider ... 4\n", + "160 Wöchentliche Kontrolle der C-Anlagen ... 11\n", + "140 Halbjährliche Kontrolle des Stabbreithalters ... 166\n", + "... ... ... ...\n", + "2559 Fehler 9723 Leistungsversorgung Antrieb defekt ... 1\n", + "2558 T-Warp-Let-Off1 schleppfehler ... 1\n", + "2557 Fahrräder wurden gewartet und gereinigt. ... 1\n", + "2556 Bohrlöcher an Gebots- und Verbotszeichen anbri... ... 1\n", + "6782 Befestigung Deckel für Batteriefach defekt ... ... 2\n", + "\n", + "[4545 rows x 5 columns]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ret[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0fea45-c26b-4253-b7f6-95ad70d0205a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a059ea-0eb8-4db1-b859-3fc07e42faff", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "d1c1190f-0c80-40e3-8965-78d68400a33d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-3_remove_NA.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-TargetFeature_Step-5_analyse_feature.pkl'),\n", + " WindowsPath('A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/Pipe-Merge_Duplicates_Step-1_merge_similarity_dupl.pkl'))" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files = get_files(r'A:\\Arbeitsaufgaben\\lang-main\\scripts\\results\\test_20240529')\n", + "files" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "e26c52eb-7a6b-49da-97a9-6e24a2a4d91e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shared:INFO | 2024-05-29 11:56:46 +0000 | Loaded file successfully.\n" + ] + } + ], + "source": [ + "file = files[-1]\n", + "ret = load_pickle(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "beacf5ca-6946-413a-817c-e7e87da9ace3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexentrylennum_occurassoc_obj_idsnum_assoc_obj_ids
0162Tägliche Wartungstätigkeiten nach Vorgabe des ...6692592[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...206
133Wöchentliche Sichtkontrolle / Reinigung393108[301, 304, 305, 313, 314, 323, 329, 331, 332, ...74
2131Tägliche Überprüfung der Ölabscheider371619[0, 970, 2134, 2137]4
3160Wöchentliche Kontrolle der C-Anlagen361265[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...11
4140Halbjährliche Kontrolle des Stabbreithalters44687[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...166
.....................
67562559Fehler 9723 Leistungsversorgung Antrieb defekt461[211]1
67572558T-Warp-Let-Off1 schleppfehler301[93]1
67582557Fahrräder wurden gewartet und gereinigt.401[1707]1
67592556Bohrlöcher an Gebots- und Verbotszeichen anbri...1731[1]1
67606782Befestigung Deckel für Batteriefach defekt ...1062[306, 326]2
\n", + "

4545 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " index ... num_assoc_obj_ids\n", + "0 162 ... 206\n", + "1 33 ... 74\n", + "2 131 ... 4\n", + "3 160 ... 11\n", + "4 140 ... 166\n", + "... ... ... ...\n", + "6756 2559 ... 1\n", + "6757 2558 ... 1\n", + "6758 2557 ... 1\n", + "6759 2556 ... 1\n", + "6760 6782 ... 2\n", + "\n", + "[4545 rows x 6 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ret[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e873f4-363e-4dbf-93f1-927b4ee3c598", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "cbf0b450-ec00-471f-9627-717e52c5471d", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "74e289ed-8d3e-4a50-afdf-d1d97e8a7807", + "metadata": {}, + "outputs": [], + "source": [ + "tup = tuple(i for i in range(100000000))" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3e747e82-e6f8-47bb-918b-27bb7c37a10f", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ade9c6f4e61410fb93f35e43222705b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100000000 [00:00= 0.97)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4426f1d5-dcd2-4d64-bdca-7dece6793f8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30220" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "5b78436e-a828-42bd-a5ed-ae6045349391", + "metadata": {}, + "outputs": [], + "source": [ + "batch = idx[:200]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "75edc50e-b64c-4319-8f74-27653ed3452c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "88.5 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tuple(map(tuple, batch))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "d9c827a4-ccdf-4cc1-90af-b018ae4858a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "94.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tuple(tuple(x) for x in batch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb2a0c9-b7d2-463d-8e63-c52fc7754ae8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}