refactoring, improved string cleansing preprocessing

This commit is contained in:
Florian Förster 2024-05-31 09:59:22 +02:00
parent bb987e2108
commit 9cafc9fb97
13 changed files with 111 additions and 98 deletions

View File

@@ -3,11 +3,7 @@ import warnings
from pathlib import Path
from typing import cast
from lang_main import (
TokenGraph,
create_saving_folder,
load_pickle,
)
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
@ -23,9 +19,7 @@ from lang_main.constants import (
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
# Embedding,
# PandasIndex,
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
# dupl_idx_pairs, embds = typing.cast(
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
# pipe_embds.run(starting_values=(subset_data,)),
# )
# merge duplicates, results saved separately
subset_data = target_feat_data.loc[data_filter].copy()
ret = typing.cast(
tuple[DataFrame],
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
pipe_merge.run(starting_values=(subset_data,)),
)
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data
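The reworked run_preprocessing filters on the precomputed length column and hands the copied subset straight to pipe_merge. A minimal standalone sketch of that filtering step, with an invented frame and threshold value (the real constant comes from lang_main.constants):

import typing

import pandas as pd
from pandas import Series

THRESHOLD_AMOUNT_CHARACTERS = 30  # placeholder value for illustration

target_feat_data = pd.DataFrame(
    {'entry': ['short text', 'a considerably longer maintenance report entry ...']}
)
target_feat_data['len'] = target_feat_data['entry'].str.len()

# boolean mask: keep only entries above the character threshold
data_filter = typing.cast(Series, target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS)
# .copy() detaches the subset from the original frame so later pipeline
# steps can mutate it without SettingWithCopyWarning
subset_data = target_feat_data.loc[data_filter].copy()
print(subset_data)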

View File

@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
Wir feiern den Jahrestag am 23.11.2023, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.

View File

@@ -6,26 +6,14 @@ from pathlib import Path
from time import gmtime
from typing import Any, Final
from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
create_saving_folder,
load_pickle,
load_toml_config,
save_pickle,
)
from lang_main.io import load_toml_config
__all__ = [
'save_pickle',
'load_pickle',
'create_saving_folder',
'Embedding',
'PandasIndex',
'TokenGraph',
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
caller_file = Path(inspect.stack()[-1].filename)
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
raise FileNotFoundError('Caller file could not be correctly retrieved.')
cfg_path_external = caller_file.parent / CONFIG_FILENAME
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(

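The constants module now resolves the caller's directory once, via the outermost stack frame, and exposes it as CALLER_PATH for reuse (the dashboard further below joins its pickle paths onto it). A reduced sketch of just that lookup; the fallback that copies the bundled config next to the caller is omitted here:

import inspect
from pathlib import Path

CONFIG_FILENAME = 'lang_main_config.toml'

# the outermost stack frame belongs to the script that started the interpreter,
# i.e. the code that imported the package
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH = caller_file.parent

if not caller_file.exists():
    # interactive sessions may report a frame with no real file behind it
    raise FileNotFoundError('Caller file could not be correctly retrieved.')

# an external config is then expected right next to the caller
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
print(CALLER_PATH, cfg_path_external.exists())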
View File

@@ -11,8 +11,8 @@ import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import load_pickle, save_pickle
# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
)
if logging:
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
logger.info('Node memory: %.2f KB', (node_mem / 1024))
logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
logger.info('Total memory: %.2f KB', (total_mem / 1024))
return graph_info
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
saving_path = saving_path.with_suffix('.graphml')
nx.write_graphml(G=target_graph, path=saving_path)
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
def to_pickle(
self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
match path.suffix:
case '.graphml':
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
logger.info(f'Successfully loaded graph from GraphML file {path}.')
logger.info('Successfully loaded graph from GraphML file %s.', path)
case '.pkl' | '.pickle':
graph = typing.cast(Self, load_pickle(path))
logger.info(f'Successfully loaded graph from pickle file {path}.')
logger.info('Successfully loaded graph from pickle file %s.', path)
case _:
raise ValueError('File format not supported.')
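All logger calls in this file switch from f-strings to %-style templates with the values passed as separate arguments: the message is only interpolated if the record actually passes the level check, and the constant template is easier for log tooling to group. A small standalone illustration:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('demo')

node_mem = 2048  # bytes, invented figure

# eager: the f-string is built even though DEBUG is disabled here
logger.debug(f'Node memory: {node_mem / 1024:.2f} KB')

# lazy: logging formats the message only when the record is emitted
logger.debug('Node memory: %.2f KB', node_mem / 1024)
logger.info('Node memory: %.2f KB', node_mem / 1024)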

View File

@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex
# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
cleaned entry
"""
# remove special chars
pattern = r'[\t\n\r\f\v]+'
string = re.sub(pattern, ' ', string)
pattern = r'([,;.:!?-_\+]){2,}'
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = re.sub(pattern, r'\1', string)
string = string.strip()
return string
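clean_string_slim now draws on patterns compiled once at module import instead of passing raw pattern strings to re.sub on every call, and the repeated-punctuation rule keeps only the last matched character via the backreference. A hedged re-creation for experimenting; note the hyphen is escaped here to keep it literal, whereas in the committed character class the unescaped '?-_' span forms a range that also covers '@' and the uppercase letters:

import re

# compiled once at import time
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?\-_+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')

def clean_string_slim(string: str) -> str:
    string = pattern_special_chars.sub(' ', string)
    # '\1' is the last captured repetition, so '!!!!????' collapses to '?'
    string = pattern_repeated_chars.sub(r'\1', string)
    string = pattern_whitespace.sub(' ', string)
    return string.strip()

print(clean_string_slim('tel:::: !!!!???? +++49 123 456 789'))
# -> 'tel: ? +49 123 456 789'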
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
(
f'Successfully applied entry-wise cleansing procedure '
f'>>{cleansing_func.__name__}<< '
f'for feature >>{target_feature}<<'
)
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
@@ -203,7 +207,9 @@ def analyse_feature(
) -> tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
logger.info(
'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
)
# obtain unique entries
unique_feature_entries = feature_entries.unique()
@@ -265,7 +271,7 @@ def build_embedding_map(
# check for empty vectors
if not embd.vector_norm:
logger.debug('--- Unknown Words ---')
logger.debug(f'{embd.text=} has no vector')
logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF:
model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path)
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
return index_pairs, embds

View File

@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
logger.info('Non-relevant ObjectIDs removed successfully')
return (data,)
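For context on the logging change above: the surrounding filter keeps rows whose ObjectID is not contained in the ignore set, using ~ to negate Series.isin. A standalone pandas illustration with invented column values:

import pandas as pd

data = pd.DataFrame({'ObjectID': [1, 2, 3, 4], 'entry': ['a', 'b', 'c', 'd']})
ids_to_ignore = (2, 4)  # invented IDs

# ~ negates the boolean mask: retain rows whose ObjectID is NOT in the ignore set
data = data.loc[~(data['ObjectID'].isin(ids_to_ignore))]
print(data)  # rows with ObjectID 1 and 3 remain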

View File

@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
)
from lang_main.loggers import logger_token_analysis as logger
# ** Logging
# LOGGING_LEVEL = 'INFO'
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
# logger = logging.getLogger('ihm_analyse.token_analysis')
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
continue
logger.debug(
(
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
token,
token.pos_,
descendant,
descendant.pos_,
)
# eliminate cases of cross-references with verbs

View File

@@ -26,10 +26,10 @@ def create_saving_folder(
else:
logger.info(
(
f'Path >>{saving_path_folder}<< already exists and remained '
f'unchanged. If you want to overwrite this path, use parameter '
f'>>overwrite_existing<<.'
)
'Path >>%s<< already exists and remained unchanged. If you want to '
'overwrite this path, use parameter >>overwrite_existing<<.',
),
saving_path_folder,
)
@@ -50,7 +50,7 @@
) -> None:
with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f'Saved file successfully under {path}')
logger.info('Saved file successfully under %s', path)
def load_pickle(

View File

@@ -3,12 +3,13 @@ from typing import Final
from lang_main.types import LoggingLevels
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
# ** logging
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any
from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import load_pickle, save_pickle
from lang_main.io import load_pickle, save_pickle
# ** pipelines to perform given actions on dataset in a customisable manner
@@ -110,13 +110,13 @@ class BasePipeline:
return data
def prep_run(self) -> None:
logger.info(f'Starting processing pipeline >>{self.name}<<...')
logger.info('Starting processing pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
('The pipeline does not contain any ' 'performable actions.')
'The pipeline does not contain any performable actions.'
)
def run(
@@ -139,6 +139,6 @@ class BasePipeline:
# processing tracking
self.curr_proc_idx += 1
logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
return ret
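Only fragments of BasePipeline are visible in these hunks; purely as an illustration of the flow they imply (start logging in prep_run, guard against an empty action list, count steps, log completion), here is a heavily simplified toy version. The chaining of action results is my assumption, not the project's actual implementation:

import logging
from typing import Any, Callable

logger = logging.getLogger('pipelines-demo')

class NoPerformableActionError(Exception):
    pass

class MiniPipeline:
    """Toy stand-in for BasePipeline: runs registered actions in order."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.actions: list[Callable[..., tuple[Any, ...]]] = []
        self.curr_proc_idx = 0

    def prep_run(self) -> None:
        logger.info('Starting processing pipeline >>%s<<...', self.name)
        self.curr_proc_idx = 1
        if len(self.actions) == 0:
            raise NoPerformableActionError(
                'The pipeline does not contain any performable actions.'
            )

    def run(self, starting_values: tuple[Any, ...]) -> tuple[Any, ...]:
        self.prep_run()
        ret = starting_values
        for action in self.actions:
            # each action returns a tuple that feeds the next step
            ret = action(*ret)
            self.curr_proc_idx += 1
        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
        return ret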

View File

@@ -1,16 +1,18 @@
from typing import Literal, TypeAlias
import enum
from typing import TypeAlias
import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]
class LoggingLevels(enum.IntEnum):
DEBUG = 10
INFO = 20
WARNING = 30
ERROR = 40
CRITICAL = 50
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
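LoggingLevels changes from a Literal of level-name strings to an IntEnum whose values match the stdlib constants (DEBUG=10 through CRITICAL=50), so the members can be passed to Logger.setLevel directly, as the updated level constants do. A quick standalone check of that property:

import enum
import logging

class LoggingLevels(enum.IntEnum):
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50

logger = logging.getLogger('lang_main.shared')
# IntEnum members behave like plain ints, so setLevel accepts them as-is
logger.setLevel(LoggingLevels.INFO)

assert LoggingLevels.INFO == logging.INFO
print(logging.getLevelName(logger.level))  # 'INFO'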

View File

@@ -1,5 +1,8 @@
from typing import cast
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast
import pandas as pd
import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
dcc,
html,
)
from lang_main import load_pickle
from lang_main import CALLER_PATH
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_tl = Path(
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
# p_tl = Path(
# r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
# )
p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
return table_data, cols
if __name__ == '__main__':
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)
def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)
if __name__ == '__main__':
main()
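Because app.run blocks, the commit opens the browser from a daemon thread that sleeps briefly until the dev server is reachable. An equivalent, slightly shorter variant (my suggestion, not what the commit uses) schedules the call with threading.Timer; host and port are Dash's defaults:

import webbrowser
from threading import Timer

from dash import Dash, html

app = Dash(__name__)
app.layout = html.Div('demo')

def open_browser() -> None:
    webbrowser.open_new('http://127.0.0.1:8050/')

if __name__ == '__main__':
    # fire once after two seconds, by which time the dev server is usually listening
    Timer(2, open_browser).start()
    app.run(debug=True)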

View File

@@ -0,0 +1,15 @@
import re
string = """
Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
"""
patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
patt2 = r'[ ]{2,}'
pattern = re.compile(patt)
pattern2 = re.compile(patt2)
res = pattern.sub('', string)
res = pattern2.sub(' ', res)
print(res)
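The new scratch script tries out the date pattern that is still commented out in clean_string_slim. Because the leading day/month group and the trailing year group are optional, partial dates such as '23.11.' are stripped as well; a quick check with my own example strings (not from the commit):

import re

# same two patterns as in the scratch script above
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')

for sample in ('geboren am 01.01.2024 in Berlin', 'Termin am 23.11., bitte vormerken'):
    without_dates = pattern_dates.sub('', sample)
    print(pattern_whitespace.sub(' ', without_dates))

# prints:
# geboren am in Berlin
# Termin am , bitte vormerken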