From 9cafc9fb975091fd49f38c5c130f3000d0985f63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florian=20F=C3=B6rster?=
Date: Fri, 31 May 2024 09:59:22 +0200
Subject: [PATCH] refactoring, improved string cleansing preprocessing

---
 scripts/analyse_dataset.py              | 23 +++--------------
 scripts/test.py                         |  2 +-
 src/lang_main/__init__.py               | 24 +++++-------------
 src/lang_main/analysis/graphs.py        | 16 ++++++------
 src/lang_main/analysis/preprocessing.py | 30 +++++++++++++---------
 src/lang_main/analysis/timeline.py      |  2 +-
 src/lang_main/analysis/tokens.py        | 14 ++++------
 src/lang_main/{shared.py => io.py}      | 10 ++++----
 src/lang_main/loggers.py                | 13 +++++-----
 src/lang_main/pipelines/base.py         |  8 +++---
 src/lang_main/types.py                  | 18 +++++++------
 test-notebooks/dashboard/app.py         | 34 ++++++++++++++++++++-----
 tests/pre_test_examples.py              | 15 +++++++++++
 13 files changed, 111 insertions(+), 98 deletions(-)
 rename src/lang_main/{shared.py => io.py} (91%)
 create mode 100644 tests/pre_test_examples.py

diff --git a/scripts/analyse_dataset.py b/scripts/analyse_dataset.py
index 2316ede..766d7be 100644
--- a/scripts/analyse_dataset.py
+++ b/scripts/analyse_dataset.py
@@ -3,11 +3,7 @@ import warnings
 from pathlib import Path
 from typing import cast
 
-from lang_main import (
-    TokenGraph,
-    create_saving_folder,
-    load_pickle,
-)
+from lang_main.analysis.graphs import TokenGraph
 from lang_main.constants import (
     DO_GRAPH_POSTPROCESSING,
     DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
     THRESHOLD_AMOUNT_CHARACTERS,
     THRESHOLD_EDGE_WEIGHT,
 )
-
-# Embedding,
-# PandasIndex,
+from lang_main.io import create_saving_folder, load_pickle
 from lang_main.pipelines.predefined import (
     pipe_merge,
     pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
     target_feat_data = ret[0]
     # only entries with more than threshold amount of characters
     data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
-    # dupl_idx_pairs, embds = typing.cast(
-    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
-    #     pipe_embds.run(starting_values=(subset_data,)),
-    # )
-    # merge duplicates, results saved separately
     subset_data = target_feat_data.loc[data_filter].copy()
-    ret = typing.cast(
-        tuple[DataFrame],
-        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
-        pipe_merge.run(starting_values=(subset_data,)),
-    )
+    # merge duplicates, results saved separately
+    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
     preprocessed_data = ret[0]
 
     return preprocessed_data
diff --git a/scripts/test.py b/scripts/test.py
index 8076042..62dc3f9 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
 print(SAVE_PATH_FOLDER)
 
 txt = """
-Wir feiern den Jahrestag, olé!
+Wir feiern den Jahrestag am 23.11.2023, olé!
 tel:::: !!!!????
 +++49 123 456 789
 Doch leben wir länger. 
diff --git a/src/lang_main/__init__.py b/src/lang_main/__init__.py
index 85a218e..a8332ee 100644
--- a/src/lang_main/__init__.py
+++ b/src/lang_main/__init__.py
@@ -6,26 +6,14 @@ from pathlib import Path
 from time import gmtime
 from typing import Any, Final
 
-from lang_main.analysis.graphs import TokenGraph
-from lang_main.analysis.preprocessing import Embedding, PandasIndex
-from lang_main.shared import (
-    create_saving_folder,
-    load_pickle,
-    load_toml_config,
-    save_pickle,
-)
+from lang_main.io import load_toml_config
 
 __all__ = [
-    'save_pickle',
-    'load_pickle',
-    'create_saving_folder',
-    'Embedding',
-    'PandasIndex',
-    'TokenGraph',
+    'CALLER_PATH',
 ]
 
 logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
+LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
 logging.basicConfig(
     stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
 
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 USE_INTERNAL_CONFIG: Final[bool] = False
-
 pkg_dir = Path(__file__).parent
 cfg_path_internal = pkg_dir / CONFIG_FILENAME
+caller_file = Path(inspect.stack()[-1].filename)
+CALLER_PATH: Final[Path] = caller_file.parent
 
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
     loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    caller_file = Path(inspect.stack()[-1].filename)
+    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
     if not caller_file.exists():
         raise FileNotFoundError('Caller file could not be correctly retrieved.')
-    cfg_path_external = caller_file.parent / CONFIG_FILENAME
     if not cfg_path_external.exists():
         shutil.copy(cfg_path_internal, cfg_path_external)
         sys.exit(
diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py
index dd74ebc..562594f 100644
--- a/src/lang_main/analysis/graphs.py
+++ b/src/lang_main/analysis/graphs.py
@@ -11,8 +11,8 @@ import numpy.typing as npt
 from networkx import DiGraph, Graph
 from pandas import DataFrame
 
+from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
-from lang_main.shared import load_pickle, save_pickle
 
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
     )
 
     if logging:
-        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
-        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
-        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
-        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
+        logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
+        logger.info('Node memory: %.2f KB', (node_mem / 1024))
+        logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
+        logger.info('Total memory: %.2f KB', (total_mem / 1024))
 
     return graph_info
 
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
             saving_path = saving_path.with_suffix('.graphml')
 
         nx.write_graphml(G=target_graph, path=saving_path)
-        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
+        logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
 
     def to_pickle(
         self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
         match path.suffix:
             case '.graphml':
                 graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
-                logger.info(f'Successfully loaded graph from GraphML file {path}.')
+                logger.info('Successfully loaded graph from GraphML file %s.', path)
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
-                logger.info(f'Successfully loaded graph from pickle file {path}.')
+                logger.info('Successfully loaded graph from pickle file %s.', path)
             case _:
                 raise ValueError('File format not supported.')
diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py
index 059f6b9..f290171 100644
--- a/src/lang_main/analysis/preprocessing.py
+++ b/src/lang_main/analysis/preprocessing.py
@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
 from lang_main.pipelines.base import BasePipeline
 from lang_main.types import Embedding, PandasIndex
 
+# ** RE patterns
+pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
+pattern_repeated_chars = re.compile(r'([,;.:!?\-_\+]){2,}')
+pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+pattern_whitespace = re.compile(r'[ ]{2,}')
+
 # ** (1) dataset preparation: loading and simple preprocessing
 # following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
        cleaned entry
     """
     # remove special chars
-    pattern = r'[\t\n\r\f\v]+'
-    string = re.sub(pattern, ' ', string)
-    pattern = r'([,;.:!?-_\+]){2,}'
+    string = pattern_special_chars.sub(' ', string)
+    string = pattern_repeated_chars.sub(r'\1', string)
+    # string = pattern_dates.sub('', string)
+    string = pattern_whitespace.sub(' ', string)
     # remove whitespaces at the beginning and the end
-    string = re.sub(pattern, r'\1', string)
     string = string.strip()
 
     return string
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
     # apply given cleansing function to target feature
     data[target_feature] = data[target_feature].map(cleansing_func)
     logger.info(
-        (
-            f'Successfully applied entry-wise cleansing procedure '
-            f'>>{cleansing_func.__name__}<< '
-            f'for feature >>{target_feature}<<'
-        )
+        'Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<',
+        cleansing_func.__name__,
+        target_feature,
     )
 
     return (data,)
@@ -203,7 +207,9 @@ def analyse_feature(
 ) -> tuple[DataFrame]:
     # feature columns
     feature_entries = data[target_feature]
-    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
+    logger.info(
+        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
+    )
 
     # obtain unique entries
     unique_feature_entries = feature_entries.unique()
@@ -265,7 +271,7 @@ def build_embedding_map(
         # check for empty vectors
         if not embd.vector_norm:
             logger.debug('--- Unknown Words ---')
-            logger.debug(f'{embd.text=} has no vector')
+            logger.debug('embd.text: %s has no vector', embd.text)
     elif is_STRF:
         model = cast(SentenceTransformer, model)
         embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
         logger.info('Saving similarity candidates...')
         target_path = saving_path.joinpath(target_filename)
         df_candidates.to_excel(target_path)
-        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
+        logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
 
     return index_pairs, embds
diff --git a/src/lang_main/analysis/timeline.py b/src/lang_main/analysis/timeline.py
index 9d90c7c..e010ff2 100644
--- a/src/lang_main/analysis/timeline.py
+++ b/src/lang_main/analysis/timeline.py
@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
     )
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
-    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
+    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
     logger.info('Non-relevant ObjectIDs removed successfully')
 
     return (data,)
diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py
index cf4efb2..7adf9c9 100644
--- a/src/lang_main/analysis/tokens.py
+++ b/src/lang_main/analysis/tokens.py
@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
 )
 from lang_main.loggers import logger_token_analysis as logger
 
-# ** Logging
-# LOGGING_LEVEL = 'INFO'
-# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
-# logger = logging.getLogger('ihm_analyse.token_analysis')
-
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
                 continue
 
             logger.debug(
-                (
-                    f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
-                    f'>>{descendant}<<, POS >>{descendant.pos_}<<'
-                )
+                'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
+                token,
+                token.pos_,
+                descendant,
+                descendant.pos_,
             )
 
             # eliminate cases of cross-references with verbs
diff --git a/src/lang_main/shared.py b/src/lang_main/io.py
similarity index 91%
rename from src/lang_main/shared.py
rename to src/lang_main/io.py
index e44139f..bc19a1a 100644
--- a/src/lang_main/shared.py
+++ b/src/lang_main/io.py
@@ -26,10 +26,8 @@ def create_saving_folder(
     else:
         logger.info(
-            (
-                f'Path >>{saving_path_folder}<< already exists and remained '
-                f'unchanged. If you want to overwrite this path, use parameter '
-                f'>>overwrite_existing<<.'
-            )
+            'Path >>%s<< already exists and remained unchanged. If you want to '
+            'overwrite this path, use parameter >>overwrite_existing<<.',
+            saving_path_folder,
         )
@@ -50,7 +50,7 @@
 ) -> None:
     with open(path, 'wb') as file:
         pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
-    logger.info(f'Saved file successfully under {path}')
+    logger.info('Saved file successfully under %s', path)
 
 
 def load_pickle(
diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py
index eadbb4d..eecb00b 100644
--- a/src/lang_main/loggers.py
+++ b/src/lang_main/loggers.py
@@ -3,12 +3,13 @@ from typing import Final
 
 from lang_main.types import LoggingLevels
 
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
+# ** logging
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
 
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
diff --git a/src/lang_main/pipelines/base.py b/src/lang_main/pipelines/base.py
index ad78589..d1da557 100644
--- a/src/lang_main/pipelines/base.py
+++ b/src/lang_main/pipelines/base.py
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any
 
 from lang_main.loggers import logger_pipelines as logger
-from lang_main.shared import load_pickle, save_pickle
+from lang_main.io import load_pickle, save_pickle
 
 # ** pipelines to perform given actions on dataset in a customisable manner
@@ -110,13 +110,13 @@ class BasePipeline:
         return data
 
     def prep_run(self) -> None:
-        logger.info(f'Starting processing pipeline >>{self.name}<<...')
+        logger.info('Starting processing pipeline >>%s<<...', self.name)
         # progress tracking
         self.curr_proc_idx = 1
         # check if performable actions available
         if len(self.actions) == 0:
             raise NoPerformableActionError(
-                ('The pipeline does not contain any ' 'performable actions.')
+                'The pipeline does not contain any performable actions.'
             )
 
     def run(
@@ -139,6 +139,6 @@ class BasePipeline:
             # processing tracking
             self.curr_proc_idx += 1
 
-        logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
+        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
 
         return ret
diff --git a/src/lang_main/types.py b/src/lang_main/types.py
index a635987..e9f1c77 100644
--- a/src/lang_main/types.py
+++ b/src/lang_main/types.py
@@ -1,16 +1,18 @@
-from typing import Literal, TypeAlias
+import enum
+from typing import TypeAlias
 
 import numpy as np
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
 
-LoggingLevels: TypeAlias = Literal[
-    'DEBUG',
-    'INFO',
-    'WARNING',
-    'ERROR',
-    'CRITICAL',
-]
+
+class LoggingLevels(enum.IntEnum):
+    DEBUG = 10
+    INFO = 20
+    WARNING = 30
+    ERROR = 40
+    CRITICAL = 50
+
 
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
diff --git a/test-notebooks/dashboard/app.py b/test-notebooks/dashboard/app.py
index 29689d6..21ae693 100644
--- a/test-notebooks/dashboard/app.py
+++ b/test-notebooks/dashboard/app.py
@@ -1,5 +1,8 @@
-from typing import cast
+import time
+import webbrowser
 from pathlib import Path
+from threading import Thread
+from typing import cast
 
 import pandas as pd
 import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
     dcc,
     html,
 )
-from lang_main import load_pickle
+from lang_main import CALLER_PATH
+from lang_main.io import load_pickle
 from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame
 
 # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
 
 # ** data
-p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
-p_tl = Path(
-    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
-)
+# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
+p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
+# p_tl = Path(
+#     r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
+# )
+p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
 ret = cast(DataFrame, load_pickle(p_df))
 data = ret[0]
 ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
     return table_data, cols
 
 
-if __name__ == '__main__':
+def _start_webbrowser():
+    host = '127.0.0.1'
+    port = '8050'
+    address = f'http://{host}:{port}/'
+    time.sleep(2)
+    webbrowser.open_new(address)
+
+
+def main():
+    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
+    webbrowser_thread.start()
     app.run(debug=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/pre_test_examples.py b/tests/pre_test_examples.py
new file mode 100644
index 0000000..85d3b5f
--- /dev/null
+++ b/tests/pre_test_examples.py
@@ -0,0 +1,15 @@
+import re
+
+
+string = """
+Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
+"""
+
+patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
+patt2 = r'[ ]{2,}'
+pattern = re.compile(patt)
+pattern2 = re.compile(patt2)
+res = pattern.sub('', string)
+res = pattern2.sub(' ', res)
+
+print(res)
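
A quick end-to-end look at the new cleansing flow: the sketch below re-creates the module-level patterns from preprocessing.py so it runs standalone without importing lang_main, and feeds them a made-up entry; pattern_dates is left out because its substitution stays commented out in clean_string_slim.

import re

# module-level patterns, compiled once (mirrors preprocessing.py above)
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?\-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')


def clean_string_slim(string: str) -> str:
    # collapse runs of control characters into single spaces
    string = pattern_special_chars.sub(' ', string)
    # reduce runs of punctuation to their last character
    string = pattern_repeated_chars.sub(r'\1', string)
    # squeeze repeated spaces, then trim both ends
    string = pattern_whitespace.sub(' ', string)
    return string.strip()


print(clean_string_slim('tel::::\t!!!!????\n+++49 123 456 789'))
# tel: ? +49 123 456 789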
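The sweep from f-strings to %-style logger arguments is not cosmetic: logging only interpolates the message when the record actually passes the level filter, so suppressed DEBUG calls no longer pay for string formatting. A contrived comparison with a throwaway logger name:

import logging

logger = logging.getLogger('lang_main.example')
logger.setLevel(logging.INFO)

big_state = {i: str(i) for i in range(10_000)}

# eager: the f-string renders big_state even though DEBUG is filtered out
logger.debug(f'state: {big_state}')

# lazy: the argument is only formatted if the record is actually emitted
logger.debug('state: %s', big_state)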
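The Literal-to-IntEnum change in types.py works because setLevel() accepts plain ints and IntEnum members behave as ints; the string literals also worked, but the enum keeps the level constants typed and numerically comparable. A quick check against the enum as defined in the patch:

import enum
import logging


class LoggingLevels(enum.IntEnum):  # as defined in src/lang_main/types.py
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50


logger = logging.getLogger('lang_main.shared')
logger.setLevel(LoggingLevels.INFO)  # IntEnum members pass wherever an int is expected
assert logger.level == logging.INFO == LoggingLevels.INFO
assert LoggingLevels.DEBUG < LoggingLevels.ERROR  # ordered comparisons work, too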