refactoring, improved string cleansing preprocessing

Florian Förster 2024-05-31 09:59:22 +02:00
parent bb987e2108
commit 9cafc9fb97
13 changed files with 111 additions and 98 deletions

View File

@@ -3,11 +3,7 @@ import warnings
 from pathlib import Path
 from typing import cast
-from lang_main import (
-    TokenGraph,
-    create_saving_folder,
-    load_pickle,
-)
+from lang_main.analysis.graphs import TokenGraph
 from lang_main.constants import (
     DO_GRAPH_POSTPROCESSING,
     DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
     THRESHOLD_AMOUNT_CHARACTERS,
     THRESHOLD_EDGE_WEIGHT,
 )
-# Embedding,
-# PandasIndex,
+from lang_main.io import create_saving_folder, load_pickle
 from lang_main.pipelines.predefined import (
     pipe_merge,
     pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
     target_feat_data = ret[0]
     # only entries with more than threshold amount of characters
     data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
-    # dupl_idx_pairs, embds = typing.cast(
-    #     tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
-    #     pipe_embds.run(starting_values=(subset_data,)),
-    # )
-    # merge duplicates, results saved separately
     subset_data = target_feat_data.loc[data_filter].copy()
-    ret = typing.cast(
-        tuple[DataFrame],
-        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
-        pipe_merge.run(starting_values=(subset_data,)),
-    )
+    # merge duplicates, results saved separately
+    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
     preprocessed_data = ret[0]
     return preprocessed_data

View File

@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
 print(SAVE_PATH_FOLDER)
 txt = """
-Wir feiern den Jahrestag, olé!
+Wir feiern den Jahrestag am 23.11.2023, olé!
 tel:::: !!!!???? +++49 123 456 789
 Doch leben wir länger.

View File

@@ -6,26 +6,14 @@ from pathlib import Path
 from time import gmtime
 from typing import Any, Final
-from lang_main.analysis.graphs import TokenGraph
-from lang_main.analysis.preprocessing import Embedding, PandasIndex
-from lang_main.shared import (
-    create_saving_folder,
-    load_pickle,
-    load_toml_config,
-    save_pickle,
-)
+from lang_main.io import load_toml_config
 __all__ = [
-    'save_pickle',
-    'load_pickle',
-    'create_saving_folder',
-    'Embedding',
-    'PandasIndex',
-    'TokenGraph',
+    'CALLER_PATH',
 ]
 logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
+LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
 logging.basicConfig(
     stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 USE_INTERNAL_CONFIG: Final[bool] = False
 pkg_dir = Path(__file__).parent
 cfg_path_internal = pkg_dir / CONFIG_FILENAME
+caller_file = Path(inspect.stack()[-1].filename)
+CALLER_PATH: Final[Path] = caller_file.parent
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
     loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    caller_file = Path(inspect.stack()[-1].filename)
+    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
     if not caller_file.exists():
         raise FileNotFoundError('Caller file could not be correctly retrieved.')
-    cfg_path_external = caller_file.parent / CONFIG_FILENAME
     if not cfg_path_external.exists():
         shutil.copy(cfg_path_internal, cfg_path_external)
         sys.exit(
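
The new CALLER_PATH constant relies on inspect.stack() listing frames from the current call outwards, so the last entry belongs to the top-level entry script. A minimal standalone sketch of that mechanism (not part of the repository):

import inspect
from pathlib import Path

def caller_directory() -> Path:
    # inspect.stack()[0] is this frame; [-1] is the outermost frame,
    # i.e. the script that started the interpreter
    entry_frame = inspect.stack()[-1]
    return Path(entry_frame.filename).parent

print(caller_directory())

In a REPL or frozen build the outermost frame's filename may not exist on disk, which is what the caller_file.exists() check above guards against.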

View File

@@ -11,8 +11,8 @@ import numpy.typing as npt
 from networkx import DiGraph, Graph
 from pandas import DataFrame
+from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
-from lang_main.shared import load_pickle, save_pickle
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
     )
     if logging:
-        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
-        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
-        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
-        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
+        logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
+        logger.info('Node memory: %.2f KB', (node_mem / 1024))
+        logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
+        logger.info('Total memory: %.2f KB', (total_mem / 1024))
     return graph_info
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
         saving_path = saving_path.with_suffix('.graphml')
         nx.write_graphml(G=target_graph, path=saving_path)
-        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
+        logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
     def to_pickle(
         self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
         match path.suffix:
             case '.graphml':
                 graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
-                logger.info(f'Successfully loaded graph from GraphML file {path}.')
+                logger.info('Successfully loaded graph from GraphML file %s.', path)
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
-                logger.info(f'Successfully loaded graph from pickle file {path}.')
+                logger.info('Successfully loaded graph from pickle file %s.', path)
             case _:
                 raise ValueError('File format not supported.')
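
The f-string log calls are replaced with %-style arguments throughout this file, which defers string interpolation to the logging framework: the message is only formatted if the record is actually emitted. A small standalone illustration (logger name made up):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('demo')

num_nodes, num_edges = 1_200, 5_400
# Formatting happens inside logger.info(); if INFO were disabled for
# this logger, no string formatting would take place at all.
logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)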

View File

@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
 from lang_main.pipelines.base import BasePipeline
 from lang_main.types import Embedding, PandasIndex
+# ** RE patterns
+pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
+pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
+pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+pattern_whitespace = re.compile(r'[ ]{2,}')
 # ** (1) dataset preparation: loading and simple preprocessing
 # following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
         cleaned entry
     """
     # remove special chars
-    pattern = r'[\t\n\r\f\v]+'
-    string = re.sub(pattern, ' ', string)
-    pattern = r'([,;.:!?-_\+]){2,}'
+    string = pattern_special_chars.sub(' ', string)
+    string = pattern_repeated_chars.sub(r'\1', string)
+    # string = pattern_dates.sub('', string)
+    string = pattern_whitespace.sub(' ', string)
     # remove whitespaces at the beginning and the end
-    string = re.sub(pattern, r'\1', string)
     string = string.strip()
     return string
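
Taken together, the precompiled patterns collapse control characters, repeated punctuation and runs of spaces; the date pattern is defined but stays commented out in clean_string_slim. A rough standalone check of these patterns against input adapted from the test text earlier in this commit (not part of the repository):

import re

pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')

text = 'tel::::\t!!!!????  +++49 123 456 789'
text = pattern_special_chars.sub(' ', text)       # tab -> single space
text = pattern_repeated_chars.sub(r'\1', text)    # '::::' -> ':', '!!!!????' -> '?', '+++' -> '+'
text = pattern_whitespace.sub(' ', text).strip()  # collapse leftover double spaces
print(text)  # tel: ? +49 123 456 789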
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
     # apply given cleansing function to target feature
     data[target_feature] = data[target_feature].map(cleansing_func)
     logger.info(
-        (
-            f'Successfully applied entry-wise cleansing procedure '
-            f'>>{cleansing_func.__name__}<< '
-            f'for feature >>{target_feature}<<'
-        )
+        ('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
+        cleansing_func.__name__,
+        target_feature,
     )
     return (data,)
@@ -203,7 +207,9 @@ def analyse_feature(
 ) -> tuple[DataFrame]:
     # feature columns
     feature_entries = data[target_feature]
-    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
+    logger.info(
+        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
+    )
     # obtain unique entries
     unique_feature_entries = feature_entries.unique()
@@ -265,7 +271,7 @@ def build_embedding_map(
     # check for empty vectors
     if not embd.vector_norm:
         logger.debug('--- Unknown Words ---')
-        logger.debug(f'{embd.text=} has no vector')
+        logger.debug('embd.text: %s has no vector', embd.text)
     elif is_STRF:
         model = cast(SentenceTransformer, model)
         embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
     logger.info('Saving similarity candidates...')
     target_path = saving_path.joinpath(target_filename)
     df_candidates.to_excel(target_path)
-    logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
+    logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
     return index_pairs, embds

View File

@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
     )
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
-    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
+    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
     logger.info('Non-relevant ObjectIDs removed successfully')
     return (data,)

View File

@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
 )
 from lang_main.loggers import logger_token_analysis as logger
-# ** Logging
-# LOGGING_LEVEL = 'INFO'
-# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
-# logger = logging.getLogger('ihm_analyse.token_analysis')
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
             continue
         logger.debug(
-            (
-                f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
-                f'>>{descendant}<<, POS >>{descendant.pos_}<<'
-            )
+            'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
+            token,
+            token.pos_,
+            descendant,
+            descendant.pos_,
         )
         # eliminate cases of cross-references with verbs

View File

@@ -26,10 +26,10 @@ def create_saving_folder(
     else:
         logger.info(
             (
-                f'Path >>{saving_path_folder}<< already exists and remained '
-                f'unchanged. If you want to overwrite this path, use parameter '
-                f'>>overwrite_existing<<.'
-            )
+                'Path >>%s<< already exists and remained unchanged. If you want to '
+                'overwrite this path, use parameter >>overwrite_existing<<.'
+            ),
+            saving_path_folder,
         )
@@ -50,7 +50,7 @@ def save_pickle(
 ) -> None:
     with open(path, 'wb') as file:
         pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
-    logger.info(f'Saved file successfully under {path}')
+    logger.info('Saved file successfully under %s', path)
 def load_pickle(

View File

@@ -3,12 +3,13 @@ from typing import Final
 from lang_main.types import LoggingLevels
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
+# ** logging
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any
 from lang_main.loggers import logger_pipelines as logger
-from lang_main.shared import load_pickle, save_pickle
+from lang_main.io import load_pickle, save_pickle
 # ** pipelines to perform given actions on dataset in a customisable manner
@@ -110,13 +110,13 @@ class BasePipeline:
         return data
     def prep_run(self) -> None:
-        logger.info(f'Starting processing pipeline >>{self.name}<<...')
+        logger.info('Starting processing pipeline >>%s<<...', self.name)
         # progress tracking
         self.curr_proc_idx = 1
         # check if performable actions available
         if len(self.actions) == 0:
             raise NoPerformableActionError(
-                ('The pipeline does not contain any ' 'performable actions.')
+                'The pipeline does not contain any performable actions.'
             )
     def run(
@@ -139,6 +139,6 @@ class BasePipeline:
         # processing tracking
         self.curr_proc_idx += 1
-        logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
+        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
         return ret

View File

@@ -1,16 +1,18 @@
-from typing import Literal, TypeAlias
+import enum
+from typing import TypeAlias
 import numpy as np
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
-LoggingLevels: TypeAlias = Literal[
-    'DEBUG',
-    'INFO',
-    'WARNING',
-    'ERROR',
-    'CRITICAL',
-]
+class LoggingLevels(enum.IntEnum):
+    DEBUG = 10
+    INFO = 20
+    WARNING = 30
+    ERROR = 40
+    CRITICAL = 50
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
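
Because the enum values match the numeric levels of the standard logging module, the members can be passed directly to Logger.setLevel(), as the loggers module in this commit does. A quick standalone check mirroring the enum from this diff:

import enum
import logging

class LoggingLevels(enum.IntEnum):
    # values equal the constants defined by the logging module
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50

assert LoggingLevels.INFO == logging.INFO   # IntEnum members compare equal to plain ints
logger = logging.getLogger('demo')
logger.setLevel(LoggingLevels.INFO)         # accepted wherever an int level is expected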

View File

@@ -1,5 +1,8 @@
-from typing import cast
+import time
+import webbrowser
 from pathlib import Path
+from threading import Thread
+from typing import cast
 import pandas as pd
 import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
     dcc,
     html,
 )
-from lang_main import load_pickle
+from lang_main import CALLER_PATH
+from lang_main.io import load_pickle
 from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame
 # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
 # ** data
-p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
-p_tl = Path(
-    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
-)
+# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
+p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
+# p_tl = Path(
+#     r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
+# )
+p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
 ret = cast(DataFrame, load_pickle(p_df))
 data = ret[0]
 ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
     return table_data, cols
-if __name__ == '__main__':
+def _start_webbrowser():
+    host = '127.0.0.1'
+    port = '8050'
+    adress = f'http://{host}:{port}/'
+    time.sleep(2)
+    webbrowser.open_new(adress)
+def main():
+    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
+    webbrowser_thread.start()
     app.run(debug=True)
+if __name__ == '__main__':
+    main()

View File

@@ -0,0 +1,15 @@
+import re
+
+string = """
+Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
+"""
+
+patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
+patt2 = r'[ ]{2,}'
+
+pattern = re.compile(patt)
+pattern2 = re.compile(patt2)
+
+res = pattern.sub('', string)
+res = pattern2.sub(' ', res)
+print(res)
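
For reference, a condensed version of this snippet and the output it should produce (the date is stripped, then the leftover double space is collapsed):

import re

string = 'Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.'
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')

res = pattern_whitespace.sub(' ', pattern_dates.sub('', string))
print(res)  # Hallo mein Name ist Max Mustermann und ich bin am geboren.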