refactoring, improved string cleansing preprocessing

This commit is contained in:
Florian Förster 2024-05-31 09:59:22 +02:00
parent bb987e2108
commit 9cafc9fb97
13 changed files with 111 additions and 98 deletions

View File

@@ -3,11 +3,7 @@ import warnings
from pathlib import Path
from typing import cast
from lang_main import (
TokenGraph,
create_saving_folder,
load_pickle,
)
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
@ -23,9 +19,7 @@ from lang_main.constants import (
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
# Embedding,
# PandasIndex,
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
# dupl_idx_pairs, embds = typing.cast(
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
# pipe_embds.run(starting_values=(subset_data,)),
# )
# merge duplicates, results saved separately
subset_data = target_feat_data.loc[data_filter].copy()
ret = typing.cast(
tuple[DataFrame],
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
pipe_merge.run(starting_values=(subset_data,)),
)
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data
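The reworked run_preprocessing filters on the precomputed length column and hands the copied subset straight to pipe_merge. A minimal standalone sketch of that filtering step, with an invented frame and threshold value (the real constant comes from lang_main.constants):

import typing

import pandas as pd
from pandas import Series

THRESHOLD_AMOUNT_CHARACTERS = 30  # placeholder value for illustration

target_feat_data = pd.DataFrame(
    {'entry': ['short text', 'a considerably longer maintenance report entry ...']}
)
target_feat_data['len'] = target_feat_data['entry'].str.len()

# boolean mask: keep only entries above the character threshold
data_filter = typing.cast(Series, target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)
# .copy() detaches the subset from the original frame so later pipeline
# steps can mutate it without SettingWithCopyWarning
subset_data = target_feat_data.loc[data_filter].copy()
print(subset_data)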

View File

@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
Wir feiern den Jahrestag am 23.11.2023, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.

View File

@@ -6,26 +6,14 @@ from pathlib import Path
from time import gmtime
from typing import Any, Final
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
create_saving_folder,
load_pickle,
load_toml_config,
save_pickle,
)
from lang_main.io import load_toml_config
__all__ = [
'save_pickle',
'load_pickle',
'create_saving_folder',
'Embedding',
'PandasIndex',
'TokenGraph',
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
caller_file = Path(inspect.stack()[-1].filename)
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
raise FileNotFoundError('Caller file could not be correctly retrieved.')
cfg_path_external = caller_file.parent / CONFIG_FILENAME
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(

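The constants module now resolves the caller's directory once, via the outermost stack frame, and exposes it as CALLER_PATH for reuse (the dashboard further below joins its pickle paths onto it). A reduced sketch of just that lookup; the fallback that copies the bundled config next to the caller is omitted here:

import inspect
from pathlib import Path

CONFIG_FILENAME = 'lang_main_config.toml'

# the outermost stack frame belongs to the script that started the interpreter,
# i.e. the code that imported the package
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH = caller_file.parent

if not caller_file.exists():
    # interactive sessions may report a frame with no real file behind it
    raise FileNotFoundError('Caller file could not be correctly retrieved.')

# an external config is then expected right next to the caller
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
print(CALLER_PATH, cfg_path_external.exists())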
View File

@@ -11,8 +11,8 @@ import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import load_pickle, save_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
)
if logging:
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
logger.info('Node memory: %.2f KB', (node_mem / 1024))
logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
logger.info('Total memory: %.2f KB', (total_mem / 1024))
return graph_info
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
saving_path = saving_path.with_suffix('.graphml')
nx.write_graphml(G=target_graph, path=saving_path)
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
def to_pickle(
self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
match path.suffix:
case '.graphml':
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
logger.info(f'Successfully loaded graph from GraphML file {path}.')
logger.info('Successfully loaded graph from GraphML file %s.', path)
case '.pkl' | '.pickle':
graph = typing.cast(Self, load_pickle(path))
logger.info(f'Successfully loaded graph from pickle file {path}.')
logger.info('Successfully loaded graph from pickle file %s.', path)
case _:
raise ValueError('File format not supported.')
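All logger calls in this file switch from f-strings to %-style templates with the values passed as separate arguments: the message is only interpolated if the record actually passes the level check, and the constant template is easier for log tooling to group. A small standalone illustration:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('demo')

node_mem = 2048  # bytes, invented figure

# eager: the f-string is built even though DEBUG is disabled here
logger.debug(f'Node memory: {node_mem / 1024:.2f} KB')

# lazy: logging formats the message only when the record is emitted
logger.debug('Node memory: %.2f KB', node_mem / 1024)
logger.info('Node memory: %.2f KB', node_mem / 1024)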

View File

@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex
# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
cleaned entry
"""
# remove special chars
pattern = r'[\t\n\r\f\v]+'
string = re.sub(pattern, ' ', string)
pattern = r'([,;.:!?-_\+]){2,}'
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = re.sub(pattern, r'\1', string)
string = string.strip()
return string
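clean_string_slim now draws on patterns compiled once at module import instead of passing raw pattern strings to re.sub on every call, and the repeated-punctuation rule keeps only the last matched character via the backreference. A hedged re-creation for experimenting; note the hyphen is escaped here to keep it literal, whereas in the committed character class the unescaped '?-_' span forms a range that also covers '@' and the uppercase letters:

import re

# compiled once at import time
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?\-_+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')

def clean_string_slim(string: str) -> str:
    string = pattern_special_chars.sub(' ', string)
    # '\1' is the last captured repetition, so '!!!!????' collapses to '?'
    string = pattern_repeated_chars.sub(r'\1', string)
    string = pattern_whitespace.sub(' ', string)
    return string.strip()

print(clean_string_slim('tel:::: !!!!???? +++49 123 456 789'))
# -> 'tel: ? +49 123 456 789'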
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
(
f'Successfully applied entry-wise cleansing procedure '
f'>>{cleansing_func.__name__}<< '
f'for feature >>{target_feature}<<'
)
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
@@ -203,7 +207,9 @@ def analyse_feature(
) -> tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
logger.info(
'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
)
# obtain unique entries
unique_feature_entries = feature_entries.unique()
@@ -265,7 +271,7 @@ def build_embedding_map(
# check for empty vectors
if not embd.vector_norm:
logger.debug('--- Unknown Words ---')
logger.debug(f'{embd.text=} has no vector')
logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF:
model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path)
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
return index_pairs, embds

View File

@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
logger.info('Non-relevant ObjectIDs removed successfully')
return (data,)
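For context on the logging change above: the surrounding filter keeps rows whose ObjectID is not contained in the ignore set, using ~ to negate Series.isin. A standalone pandas illustration with invented column values:

import pandas as pd

data = pd.DataFrame({'ObjectID': [1, 2, 3, 4], 'entry': ['a', 'b', 'c', 'd']})
ids_to_ignore = (2, 4)  # invented IDs

# ~ negates the boolean mask: retain rows whose ObjectID is NOT in the ignore set
data = data.loc[~(data['ObjectID'].isin(ids_to_ignore))]
print(data)  # rows with ObjectID 1 and 3 remain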

View File

@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
)
from lang_main.loggers import logger_token_analysis as logger
# ** Logging
# LOGGING_LEVEL = 'INFO'
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
# logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
continue
logger.debug(
(
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
token,
token.pos_,
descendant,
descendant.pos_,
)
# eliminate cases of cross-references with verbs

View File

@@ -26,10 +26,10 @@ def create_saving_folder(
else:
logger.info(
(
f'Path >>{saving_path_folder}<< already exists and remained '
f'unchanged. If you want to overwrite this path, use parameter '
f'>>overwrite_existing<<.'
)
'Path >>%s<< already exists and remained unchanged. If you want to '
'overwrite this path, use parameter >>overwrite_existing<<.',
),
saving_path_folder,
)
@@ -50,7 +50,7 @@
) -> None:
with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f'Saved file successfully under {path}')
logger.info('Saved file successfully under %s', path)
def load_pickle(

View File

@@ -3,12 +3,13 @@ from typing import Final
from lang_main.types import LoggingLevels
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
# ** logging
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import load_pickle, save_pickle
from lang_main.io import load_pickle, save_pickle
# ** pipelines to perform given actions on dataset in a customisable manner
@@ -110,13 +110,13 @@ class BasePipeline:
return data
def prep_run(self) -> None:
logger.info(f'Starting processing pipeline >>{self.name}<<...')
logger.info('Starting processing pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
('The pipeline does not contain any ' 'performable actions.')
'The pipeline does not contain any performable actions.'
)
def run(
@@ -139,6 +139,6 @@ class BasePipeline:
# processing tracking
self.curr_proc_idx += 1
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
return ret
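Only fragments of BasePipeline are visible in these hunks; purely as an illustration of the flow they imply (start logging in prep_run, guard against an empty action list, count steps, log completion), here is a heavily simplified toy version. The chaining of action results is my assumption, not the project's actual implementation:

import logging
from typing import Any, Callable

logger = logging.getLogger('pipelines-demo')

class NoPerformableActionError(Exception):
    pass

class MiniPipeline:
    """Toy stand-in for BasePipeline: runs registered actions in order."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.actions: list[Callable[..., tuple[Any, ...]]] = []
        self.curr_proc_idx = 0

    def prep_run(self) -> None:
        logger.info('Starting processing pipeline >>%s<<...', self.name)
        self.curr_proc_idx = 1
        if len(self.actions) == 0:
            raise NoPerformableActionError(
                'The pipeline does not contain any performable actions.'
            )

    def run(self, starting_values: tuple[Any, ...]) -> tuple[Any, ...]:
        self.prep_run()
        ret = starting_values
        for action in self.actions:
            # each action returns a tuple that feeds the next step
            ret = action(*ret)
            self.curr_proc_idx += 1
        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
        return ret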

View File

@@ -1,16 +1,18 @@
from typing import Literal, TypeAlias
import enum
from typing import TypeAlias
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]
class LoggingLevels(enum.IntEnum):
DEBUG = 10
INFO = 20
WARNING = 30
ERROR = 40
CRITICAL = 50
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
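LoggingLevels changes from a Literal of level-name strings to an IntEnum whose values match the stdlib constants (DEBUG=10 through CRITICAL=50), so the members can be passed to Logger.setLevel directly, as the updated level constants do. A quick standalone check of that property:

import enum
import logging

class LoggingLevels(enum.IntEnum):
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50

logger = logging.getLogger('lang_main.shared')
# IntEnum members behave like plain ints, so setLevel accepts them as-is
logger.setLevel(LoggingLevels.INFO)

assert LoggingLevels.INFO == logging.INFO
print(logging.getLevelName(logger.level))  # 'INFO'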

View File

@@ -1,5 +1,8 @@
from typing import cast
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import pandas as pd
import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
dcc,
html,
)
from lang_main import load_pickle
from lang_main import CALLER_PATH
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_tl = Path(
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
# p_tl = Path(
# r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
# )
p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
return table_data, cols
if __name__ == '__main__':
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()
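Because app.run blocks, the commit opens the browser from a daemon thread that sleeps briefly until the dev server is reachable. An equivalent, slightly shorter variant (my suggestion, not what the commit uses) schedules the call with threading.Timer; host and port are Dash's defaults:

import webbrowser
from threading import Timer

from dash import Dash, html

app = Dash(__name__)
app.layout = html.Div('demo')

def open_browser() -> None:
    webbrowser.open_new('http://127.0.0.1:8050/')

if __name__ == '__main__':
    # fire once after two seconds, by which time the dev server is usually listening
    Timer(2, open_browser).start()
    app.run(debug=True)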

View File

@@ -0,0 +1,15 @@
import re
string = """
Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
"""
patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
patt2 = r'[ ]{2,}'
pattern = re.compile(patt)
pattern2 = re.compile(patt2)
res = pattern.sub('', string)
res = pattern2.sub(' ', res)
print(res)
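The new scratch script tries out the date pattern that is still commented out in clean_string_slim. Because the leading day/month group and the trailing year group are optional, partial dates such as '23.11.' are stripped as well; a quick check with my own example strings (not from the commit):

import re

# same two patterns as in the scratch script above
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')

for sample in ('geboren am 01.01.2024 in Berlin', 'Termin am 23.11., bitte vormerken'):
    without_dates = pattern_dates.sub('', sample)
    print(pattern_whitespace.sub(' ', without_dates))

# prints:
# geboren am in Berlin
# Termin am , bitte vormerken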