From 9cafc9fb975091fd49f38c5c130f3000d0985f63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florian=20F=C3=B6rster?=
Date: Fri, 31 May 2024 09:59:22 +0200
Subject: [PATCH] refactoring, improved string cleansing preprocessing

---
 scripts/analyse_dataset.py              | 23 +++--------------
 scripts/test.py                         |  2 +-
 src/lang_main/__init__.py               | 24 +++++-------------
 src/lang_main/analysis/graphs.py        | 16 ++++++------
 src/lang_main/analysis/preprocessing.py | 30 +++++++++++++---------
 src/lang_main/analysis/timeline.py      |  2 +-
 src/lang_main/analysis/tokens.py        | 14 ++++------
 src/lang_main/{shared.py => io.py}      | 10 ++++----
 src/lang_main/loggers.py                | 13 +++++-----
 src/lang_main/pipelines/base.py         |  8 +++---
 src/lang_main/types.py                  | 18 +++++++------
 test-notebooks/dashboard/app.py         | 34 ++++++++++++++++++++-----
 tests/pre_test_examples.py              | 15 +++++++++++
 13 files changed, 111 insertions(+), 98 deletions(-)
 rename src/lang_main/{shared.py => io.py} (91%)
 create mode 100644 tests/pre_test_examples.py

diff --git a/scripts/analyse_dataset.py b/scripts/analyse_dataset.py
index 2316ede..766d7be 100644
--- a/scripts/analyse_dataset.py
+++ b/scripts/analyse_dataset.py
@@ -3,11 +3,7 @@ import warnings
 from pathlib import Path
 from typing import cast
 
-from lang_main import (
-    TokenGraph,
-    create_saving_folder,
-    load_pickle,
-)
+from lang_main.analysis.graphs import TokenGraph
 from lang_main.constants import (
     DO_GRAPH_POSTPROCESSING,
     DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
     THRESHOLD_AMOUNT_CHARACTERS,
     THRESHOLD_EDGE_WEIGHT,
 )
-
-# Embedding,
-# PandasIndex,
+from lang_main.io import create_saving_folder, load_pickle
 from lang_main.pipelines.predefined import (
     pipe_merge,
     pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
     target_feat_data = ret[0]
     # only entries with more than threshold amount of characters
     data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
-    # dupl_idx_pairs, embds = typing.cast(
-    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
-    #     pipe_embds.run(starting_values=(subset_data,)),
-    # )
-    # merge duplicates, results saved separately
     subset_data = target_feat_data.loc[data_filter].copy()
-    ret = typing.cast(
-        tuple[DataFrame],
-        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
-        pipe_merge.run(starting_values=(subset_data,)),
-    )
+    # merge duplicates, results saved separately
+    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
     preprocessed_data = ret[0]
 
     return preprocessed_data
diff --git a/scripts/test.py b/scripts/test.py
index 8076042..62dc3f9 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
 print(SAVE_PATH_FOLDER)
 
 txt = """
-Wir feiern den Jahrestag, olé!
+Wir feiern den Jahrestag am 23.11.2023, olé!
 tel:::: !!!!????
 +++49 123 456 789
 Doch leben wir länger. 
diff --git a/src/lang_main/__init__.py b/src/lang_main/__init__.py
index 85a218e..a8332ee 100644
--- a/src/lang_main/__init__.py
+++ b/src/lang_main/__init__.py
@@ -6,26 +6,14 @@ from pathlib import Path
 from time import gmtime
 from typing import Any, Final
 
-from lang_main.analysis.graphs import TokenGraph
-from lang_main.analysis.preprocessing import Embedding, PandasIndex
-from lang_main.shared import (
-    create_saving_folder,
-    load_pickle,
-    load_toml_config,
-    save_pickle,
-)
+from lang_main.io import load_toml_config
 
 __all__ = [
-    'save_pickle',
-    'load_pickle',
-    'create_saving_folder',
-    'Embedding',
-    'PandasIndex',
-    'TokenGraph',
+    'CALLER_PATH',
 ]
 
 logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
+LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
 logging.basicConfig(
     stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
 
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 USE_INTERNAL_CONFIG: Final[bool] = False
-
 pkg_dir = Path(__file__).parent
 cfg_path_internal = pkg_dir / CONFIG_FILENAME
+caller_file = Path(inspect.stack()[-1].filename)
+CALLER_PATH: Final[Path] = caller_file.parent
 
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
     loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    caller_file = Path(inspect.stack()[-1].filename)
+    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
     if not caller_file.exists():
         raise FileNotFoundError('Caller file could not be correctly retrieved.')
-    cfg_path_external = caller_file.parent / CONFIG_FILENAME
     if not cfg_path_external.exists():
         shutil.copy(cfg_path_internal, cfg_path_external)
         sys.exit(
diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py
index dd74ebc..562594f 100644
--- a/src/lang_main/analysis/graphs.py
+++ b/src/lang_main/analysis/graphs.py
@@ -11,8 +11,8 @@ import numpy.typing as npt
 from networkx import DiGraph, Graph
 from pandas import DataFrame
 
+from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
-from lang_main.shared import load_pickle, save_pickle
 
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
     )
 
     if logging:
-        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
-        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
-        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
-        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
+        logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
+        logger.info('Node memory: %.2f KB', (node_mem / 1024))
+        logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
+        logger.info('Total memory: %.2f KB', (total_mem / 1024))
 
     return graph_info
 
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
             saving_path = saving_path.with_suffix('.graphml')
 
         nx.write_graphml(G=target_graph, path=saving_path)
-        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
+        logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
 
     def to_pickle(
         self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
         match path.suffix:
             case '.graphml':
                 graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
-                logger.info(f'Successfully loaded graph from GraphML file {path}.')
+                logger.info('Successfully loaded graph from GraphML file %s.', path)
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
-                logger.info(f'Successfully loaded graph from pickle file {path}.')
+                logger.info('Successfully loaded graph from pickle file %s.', path)
             case _:
                 raise ValueError('File format not supported.')
diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py
index 059f6b9..f290171 100644
--- a/src/lang_main/analysis/preprocessing.py
+++ b/src/lang_main/analysis/preprocessing.py
@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
 from lang_main.pipelines.base import BasePipeline
 from lang_main.types import Embedding, PandasIndex
 
+# ** RE patterns
+pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
+pattern_repeated_chars = re.compile(r'([,;.:!?\-_\+]){2,}')
+pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+pattern_whitespace = re.compile(r'[ ]{2,}')
+
 # ** (1) dataset preparation: loading and simple preprocessing
 # following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
        cleaned entry
     """
     # remove special chars
-    pattern = r'[\t\n\r\f\v]+'
-    string = re.sub(pattern, ' ', string)
-    pattern = r'([,;.:!?-_\+]){2,}'
+    string = pattern_special_chars.sub(' ', string)
+    string = pattern_repeated_chars.sub(r'\1', string)
+    # string = pattern_dates.sub('', string)
+    string = pattern_whitespace.sub(' ', string)
     # remove whitespaces at the beginning and the end
-    string = re.sub(pattern, r'\1', string)
     string = string.strip()
 
     return string
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
     # apply given cleansing function to target feature
     data[target_feature] = data[target_feature].map(cleansing_func)
     logger.info(
-        (
-            f'Successfully applied entry-wise cleansing procedure '
-            f'>>{cleansing_func.__name__}<< '
-            f'for feature >>{target_feature}<<'
-        )
+        'Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<',
+        cleansing_func.__name__,
+        target_feature,
     )
 
     return (data,)
@@ -203,7 +207,9 @@ def analyse_feature(
 ) -> tuple[DataFrame]:
     # feature columns
     feature_entries = data[target_feature]
-    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
+    logger.info(
+        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
+    )
 
     # obtain unique entries
     unique_feature_entries = feature_entries.unique()
@@ -265,7 +271,7 @@ def build_embedding_map(
         # check for empty vectors
         if not embd.vector_norm:
             logger.debug('--- Unknown Words ---')
-            logger.debug(f'{embd.text=} has no vector')
+            logger.debug('embd.text: %s has no vector', embd.text)
     elif is_STRF:
         model = cast(SentenceTransformer, model)
         embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
         logger.info('Saving similarity candidates...')
         target_path = saving_path.joinpath(target_filename)
         df_candidates.to_excel(target_path)
-        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
+        logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
 
     return index_pairs, embds
diff --git a/src/lang_main/analysis/timeline.py b/src/lang_main/analysis/timeline.py
index 9d90c7c..e010ff2 100644
--- a/src/lang_main/analysis/timeline.py
+++ b/src/lang_main/analysis/timeline.py
@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
     )
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
-    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
+    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
     logger.info('Non-relevant ObjectIDs removed successfully')
 
     return (data,)
diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py
index cf4efb2..7adf9c9 100644
--- a/src/lang_main/analysis/tokens.py
+++ b/src/lang_main/analysis/tokens.py
@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
 )
 from lang_main.loggers import logger_token_analysis as logger
 
-# ** Logging
-# LOGGING_LEVEL = 'INFO'
-# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
-# logger = logging.getLogger('ihm_analyse.token_analysis')
-
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
                 continue
 
             logger.debug(
-                (
-                    f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
-                    f'>>{descendant}<<, POS >>{descendant.pos_}<<'
-                )
+                'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
+                token,
+                token.pos_,
+                descendant,
+                descendant.pos_,
             )
 
             # eliminate cases of cross-references with verbs
diff --git a/src/lang_main/shared.py b/src/lang_main/io.py
similarity index 91%
rename from src/lang_main/shared.py
rename to src/lang_main/io.py
index e44139f..bc19a1a 100644
--- a/src/lang_main/shared.py
+++ b/src/lang_main/io.py
@@ -26,10 +26,8 @@ def create_saving_folder(
     else:
         logger.info(
-            (
-                f'Path >>{saving_path_folder}<< already exists and remained '
-                f'unchanged. If you want to overwrite this path, use parameter '
-                f'>>overwrite_existing<<.'
-            )
+            'Path >>%s<< already exists and remained unchanged. If you want to '
+            'overwrite this path, use parameter >>overwrite_existing<<.',
+            saving_path_folder,
         )
@@ -50,7 +50,7 @@
 ) -> None:
     with open(path, 'wb') as file:
         pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
-    logger.info(f'Saved file successfully under {path}')
+    logger.info('Saved file successfully under %s', path)
 
 
 def load_pickle(
diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py
index eadbb4d..eecb00b 100644
--- a/src/lang_main/loggers.py
+++ b/src/lang_main/loggers.py
@@ -3,12 +3,13 @@ from typing import Final
 
 from lang_main.types import LoggingLevels
 
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
+# ** logging
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
 
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
diff --git a/src/lang_main/pipelines/base.py b/src/lang_main/pipelines/base.py
index ad78589..d1da557 100644
--- a/src/lang_main/pipelines/base.py
+++ b/src/lang_main/pipelines/base.py
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any
 
 from lang_main.loggers import logger_pipelines as logger
-from lang_main.shared import load_pickle, save_pickle
+from lang_main.io import load_pickle, save_pickle
 
 # ** pipelines to perform given actions on dataset in a customisable manner
@@ -110,13 +110,13 @@ class BasePipeline:
         return data
 
     def prep_run(self) -> None:
-        logger.info(f'Starting processing pipeline >>{self.name}<<...')
+        logger.info('Starting processing pipeline >>%s<<...', self.name)
         # progress tracking
         self.curr_proc_idx = 1
         # check if performable actions available
         if len(self.actions) == 0:
             raise NoPerformableActionError(
-                ('The pipeline does not contain any ' 'performable actions.')
+                'The pipeline does not contain any performable actions.'
             )
 
     def run(
@@ -139,6 +139,6 @@ class BasePipeline:
             # processing tracking
             self.curr_proc_idx += 1
 
-        logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
+        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
 
         return ret
diff --git a/src/lang_main/types.py b/src/lang_main/types.py
index a635987..e9f1c77 100644
--- a/src/lang_main/types.py
+++ b/src/lang_main/types.py
@@ -1,16 +1,18 @@
-from typing import Literal, TypeAlias
+import enum
+from typing import TypeAlias
 
 import numpy as np
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
 
-LoggingLevels: TypeAlias = Literal[
-    'DEBUG',
-    'INFO',
-    'WARNING',
-    'ERROR',
-    'CRITICAL',
-]
+
+class LoggingLevels(enum.IntEnum):
+    DEBUG = 10
+    INFO = 20
+    WARNING = 30
+    ERROR = 40
+    CRITICAL = 50
+
 
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
diff --git a/test-notebooks/dashboard/app.py b/test-notebooks/dashboard/app.py
index 29689d6..21ae693 100644
--- a/test-notebooks/dashboard/app.py
+++ b/test-notebooks/dashboard/app.py
@@ -1,5 +1,8 @@
-from typing import cast
+import time
+import webbrowser
 from pathlib import Path
+from threading import Thread
+from typing import cast
 
 import pandas as pd
 import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
     dcc,
     html,
 )
-from lang_main import load_pickle
+from lang_main import CALLER_PATH
+from lang_main.io import load_pickle
 from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame
 
 # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
 
 # ** data
-p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
-p_tl = Path(
-    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
-)
+# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
+p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
+# p_tl = Path(
+#     r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
+# )
+p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
 ret = cast(DataFrame, load_pickle(p_df))
 data = ret[0]
 ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
     return table_data, cols
 
 
-if __name__ == '__main__':
+def _start_webbrowser():
+    host = '127.0.0.1'
+    port = '8050'
+    address = f'http://{host}:{port}/'
+    time.sleep(2)
+    webbrowser.open_new(address)
+
+
+def main():
+    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
+    webbrowser_thread.start()
     app.run(debug=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/pre_test_examples.py b/tests/pre_test_examples.py
new file mode 100644
index 0000000..85d3b5f
--- /dev/null
+++ b/tests/pre_test_examples.py
@@ -0,0 +1,15 @@
+import re
+
+
+string = """
+Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
+"""
+
+patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
+patt2 = r'[ ]{2,}'
+pattern = re.compile(patt)
+pattern2 = re.compile(patt2)
+res = pattern.sub('', string)
+res = pattern2.sub(' ', res)
+
+print(res)
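
A quick end-to-end look at the new cleansing flow: the sketch below re-creates the module-level patterns from preprocessing.py so it runs standalone without importing lang_main, and feeds them a made-up entry; pattern_dates is left out because its substitution stays commented out in clean_string_slim.

import re

# module-level patterns, compiled once (mirrors preprocessing.py above)
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?\-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')


def clean_string_slim(string: str) -> str:
    # collapse runs of control characters into single spaces
    string = pattern_special_chars.sub(' ', string)
    # reduce runs of punctuation to their last character
    string = pattern_repeated_chars.sub(r'\1', string)
    # squeeze repeated spaces, then trim both ends
    string = pattern_whitespace.sub(' ', string)
    return string.strip()


print(clean_string_slim('tel::::\t!!!!????\n+++49 123 456 789'))
# tel: ? +49 123 456 789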
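The sweep from f-strings to %-style logger arguments is not cosmetic: logging only interpolates the message when the record actually passes the level filter, so suppressed DEBUG calls no longer pay for string formatting. A contrived comparison with a throwaway logger name:

import logging

logger = logging.getLogger('lang_main.example')
logger.setLevel(logging.INFO)

big_state = {i: str(i) for i in range(10_000)}

# eager: the f-string renders big_state even though DEBUG is filtered out
logger.debug(f'state: {big_state}')

# lazy: the argument is only formatted if the record is actually emitted
logger.debug('state: %s', big_state)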
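The Literal-to-IntEnum change in types.py works because setLevel() accepts plain ints and IntEnum members behave as ints; the string literals also worked, but the enum keeps the level constants typed and numerically comparable. A quick check against the enum as defined in the patch:

import enum
import logging


class LoggingLevels(enum.IntEnum):  # as defined in src/lang_main/types.py
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50


logger = logging.getLogger('lang_main.shared')
logger.setLevel(LoggingLevels.INFO)  # IntEnum members pass wherever an int is expected
assert logger.level == logging.INFO == LoggingLevels.INFO
assert LoggingLevels.DEBUG < LoggingLevels.ERROR  # ordered comparisons work, too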