refactoring, improved string cleansing preprocessing
parent bb987e2108
commit 9cafc9fb97
@@ -3,11 +3,7 @@ import warnings
from pathlib import Path
from typing import cast

from lang_main import (
TokenGraph,
create_saving_folder,
load_pickle,
)
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)

# Embedding,
# PandasIndex,
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
# dupl_idx_pairs, embds = typing.cast(
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
# pipe_embds.run(starting_values=(subset_data,)),
# )
# merge duplicates, results saved separately
subset_data = target_feat_data.loc[data_filter].copy()
ret = typing.cast(
tuple[DataFrame],
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
pipe_merge.run(starting_values=(subset_data,)),
)
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]

return preprocessed_data

@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER

print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
Wir feiern den Jahrestag am 23.11.2023, olé!
tel:::: !!!!???? +++49 123 456 789

Doch leben wir länger.

@@ -6,26 +6,14 @@ from pathlib import Path
from time import gmtime
from typing import Any, Final

from lang_main.analysis.graphs import TokenGraph
from lang_main.analysis.preprocessing import Embedding, PandasIndex
from lang_main.shared import (
create_saving_folder,
load_pickle,
load_toml_config,
save_pickle,
)
from lang_main.io import load_toml_config

__all__ = [
'save_pickle',
'load_pickle',
'create_saving_folder',
'Embedding',
'PandasIndex',
'TokenGraph',
'CALLER_PATH',
]

logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(

CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = False

pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent

# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
caller_file = Path(inspect.stack()[-1].filename)
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
raise FileNotFoundError('Caller file could not be correctly retrieved.')
cfg_path_external = caller_file.parent / CONFIG_FILENAME
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(

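The block above resolves the caller's location once, via the outermost stack frame, and exposes it as CALLER_PATH so the external config can be looked up (and seeded) next to the calling script. A minimal sketch of the idea, assuming the package is imported from a regular script rather than an interactive session:

```python
import inspect
from pathlib import Path

# The outermost frame of the call stack belongs to the script that started
# the interpreter, so its parent folder is treated as the caller's directory.
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH = caller_file.parent

# On first use an external config could then be seeded next to the caller,
# e.g. shutil.copy(internal_config, CALLER_PATH / 'lang_main_config.toml').
print(CALLER_PATH)
```
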
@@ -11,8 +11,8 @@ import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame

from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
from lang_main.shared import load_pickle, save_pickle

# TODO change logging behaviour, add logging to file
LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
)

if logging:
logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
logger.info('Node memory: %.2f KB', (node_mem / 1024))
logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
logger.info('Total memory: %.2f KB', (total_mem / 1024))

return graph_info

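The hunk above is representative of a change applied throughout this commit: eager f-string messages are replaced by %-style logging calls, so the arguments are only interpolated when the record is actually emitted. A minimal sketch of the difference (logger name and values are illustrative, not taken from the code):

```python
import logging

logger = logging.getLogger('lang_main.analysis.graphs')  # illustrative name
num_nodes, num_edges = 1_204, 5_318

# f-string: the message is built even if INFO is disabled for this logger
logger.info(f'Graph properties: {num_nodes} Nodes, {num_edges} Edges')

# %-style: formatting is deferred until the record is actually handled
logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
```
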
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):

saving_path = saving_path.with_suffix('.graphml')
nx.write_graphml(G=target_graph, path=saving_path)
logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
logger.info('Successfully saved graph as GraphML file under %s.', saving_path)

def to_pickle(
self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
match path.suffix:
case '.graphml':
graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
logger.info(f'Successfully loaded graph from GraphML file {path}.')
logger.info('Successfully loaded graph from GraphML file %s.', path)
case '.pkl' | '.pickle':
graph = typing.cast(Self, load_pickle(path))
logger.info(f'Successfully loaded graph from pickle file {path}.')
logger.info('Successfully loaded graph from pickle file %s.', path)
case _:
raise ValueError('File format not supported.')

@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex

# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')


# ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
cleaned entry
"""
# remove special chars
pattern = r'[\t\n\r\f\v]+'
string = re.sub(pattern, ' ', string)
pattern = r'([,;.:!?-_\+]){2,}'
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = re.sub(pattern, r'\1', string)
string = string.strip()

return string
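After this change the cleansing steps run through precompiled module-level patterns instead of ad-hoc re.sub calls. A small standalone sketch of how the steps compose, using the patterns copied from the diff above and one of the sample strings from the test script earlier in this commit (the function name is illustrative):

```python
import re

# patterns copied from the hunk above
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}')

def clean_string_slim_sketch(string: str) -> str:
    # collapse tabs, newlines and similar control characters into spaces
    string = pattern_special_chars.sub(' ', string)
    # reduce runs of repeated punctuation to the last repeated character
    string = pattern_repeated_chars.sub(r'\1', string)
    # squeeze multiple spaces and trim both ends
    string = pattern_whitespace.sub(' ', string)
    return string.strip()

print(clean_string_slim_sketch('tel:::: !!!!???? +++49 123 456 789'))
# -> 'tel: ? +49 123 456 789'
```
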
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
(
f'Successfully applied entry-wise cleansing procedure '
f'>>{cleansing_func.__name__}<< '
f'for feature >>{target_feature}<<'
)
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)

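entry_wise_cleansing maps the cleansing callable over the target feature column. A small self-contained sketch of that pattern with pandas (column name, data, and the stand-in cleansing function are illustrative):

```python
import re
import pandas as pd

pattern_whitespace = re.compile(r'[ ]{2,}')

def squeeze_spaces(entry: str) -> str:
    # stand-in for clean_string_slim: collapse runs of spaces and trim
    return pattern_whitespace.sub(' ', entry).strip()

df = pd.DataFrame({'entry': ['Hallo   Welt ', ' tel:  +49 123 ']})
# entry-wise cleansing: map the cleansing callable over the target column
df['entry'] = df['entry'].map(squeeze_spaces)
print(df['entry'].tolist())  # ['Hallo Welt', 'tel: +49 123']
```
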
@@ -203,7 +207,9 @@ def analyse_feature(
) -> tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
logger.info(
'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
)
# obtain unique entries
unique_feature_entries = feature_entries.unique()

@@ -265,7 +271,7 @@ def build_embedding_map(
# check for empty vectors
if not embd.vector_norm:
logger.debug('--- Unknown Words ---')
logger.debug(f'{embd.text=} has no vector')
logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF:
model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path)
logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)

return index_pairs, embds

@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
logger.info('Non-relevant ObjectIDs removed successfully')

return (data,)

@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
)
from lang_main.loggers import logger_token_analysis as logger

# ** Logging
# LOGGING_LEVEL = 'INFO'
# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
# logger = logging.getLogger('ihm_analyse.token_analysis')

# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
continue

logger.debug(
(
f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
f'>>{descendant}<<, POS >>{descendant.pos_}<<'
)
'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
token,
token.pos_,
descendant,
descendant.pos_,
)

# eliminate cases of cross-references with verbs

@@ -26,10 +26,10 @@ def create_saving_folder(
else:
logger.info(
(
f'Path >>{saving_path_folder}<< already exists and remained '
f'unchanged. If you want to overwrite this path, use parameter '
f'>>overwrite_existing<<.'
)
'Path >>%s<< already exists and remained unchanged. If you want to '
'overwrite this path, use parameter >>overwrite_existing<<.',
),
saving_path_folder,
)


@@ -50,7 +50,7 @@ def save_pickle(
) -> None:
with open(path, 'wb') as file:
pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
logger.info(f'Saved file successfully under {path}')
logger.info('Saved file successfully under %s', path)


def load_pickle(

@@ -3,12 +3,13 @@ from typing import Final

from lang_main.types import LoggingLevels

LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
# ** logging
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO

logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)

@@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any

from lang_main.loggers import logger_pipelines as logger
from lang_main.shared import load_pickle, save_pickle
from lang_main.io import load_pickle, save_pickle

# ** pipelines to perform given actions on dataset in a customisable manner

@@ -110,13 +110,13 @@ class BasePipeline:
return data

def prep_run(self) -> None:
logger.info(f'Starting processing pipeline >>{self.name}<<...')
logger.info('Starting processing pipeline >>%s<<...', self.name)
# progress tracking
self.curr_proc_idx = 1
# check if performable actions available
if len(self.actions) == 0:
raise NoPerformableActionError(
('The pipeline does not contain any ' 'performable actions.')
'The pipeline does not contain any performable actions.'
)

def run(
@@ -139,6 +139,6 @@ class BasePipeline:
# processing tracking
self.curr_proc_idx += 1

logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
logger.info('Processing pipeline >>%s<< successfully ended.', self.name)

return ret

@@ -1,16 +1,18 @@
from typing import Literal, TypeAlias
import enum
from typing import TypeAlias

import numpy as np
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor

LoggingLevels: TypeAlias = Literal[
'DEBUG',
'INFO',
'WARNING',
'ERROR',
'CRITICAL',
]

class LoggingLevels(enum.IntEnum):
DEBUG = 10
INFO = 20
WARNING = 30
ERROR = 40
CRITICAL = 50


PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
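The Literal type alias is replaced by an IntEnum whose members carry the standard library's numeric levels, so the constants in loggers.py (see the hunk earlier in this commit) can be handed straight to setLevel(). A small sketch of the intended usage (logger name is illustrative):

```python
import enum
import logging

class LoggingLevels(enum.IntEnum):
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50

# IntEnum members compare equal to the stdlib level constants...
assert LoggingLevels.INFO == logging.INFO
# ...so they can be passed directly to setLevel()
logging.getLogger('lang_main.shared').setLevel(LoggingLevels.INFO)
```
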
@@ -1,5 +1,8 @@
from typing import cast
import time
import webbrowser
from pathlib import Path
from threading import Thread
from typing import cast

import pandas as pd
import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
dcc,
html,
)
from lang_main import load_pickle
from lang_main import CALLER_PATH
from lang_main.io import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame

# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')

# ** data
p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_tl = Path(
r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
)
# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
# p_tl = Path(
# r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
# )
p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
ret = cast(DataFrame, load_pickle(p_df))
data = ret[0]
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
return table_data, cols


if __name__ == '__main__':
def _start_webbrowser():
host = '127.0.0.1'
port = '8050'
adress = f'http://{host}:{port}/'
time.sleep(2)
webbrowser.open_new(adress)


def main():
webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
webbrowser_thread.start()
app.run(debug=True)


if __name__ == '__main__':
main()

15 tests/pre_test_examples.py Normal file
@@ -0,0 +1,15 @@
import re


string = """
Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
"""

patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
patt2 = r'[ ]{2,}'
pattern = re.compile(patt)
pattern2 = re.compile(patt2)
res = pattern.sub('', string)
res = pattern2.sub(' ', res)

print(res)
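For reference, a short sketch of what the date pattern exercised by this new test file removes; the sample phrases are taken from strings used elsewhere in this commit:

```python
import re

pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')

# full day.month.year dates are stripped...
print(pattern_dates.sub('', 'geboren am 01.01.2024'))     # 'geboren am '
# ...including dates embedded in longer phrases
print(pattern_dates.sub('', 'Jahrestag am 23.11.2023,'))  # 'Jahrestag am ,'
```
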