refactoring, improved string cleansing preprocessing
parent bb987e2108
commit 9cafc9fb97
@@ -3,11 +3,7 @@ import warnings
 from pathlib import Path
 from typing import cast
 
-from lang_main import (
-    TokenGraph,
-    create_saving_folder,
-    load_pickle,
-)
+from lang_main.analysis.graphs import TokenGraph
 from lang_main.constants import (
     DO_GRAPH_POSTPROCESSING,
     DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
     THRESHOLD_AMOUNT_CHARACTERS,
     THRESHOLD_EDGE_WEIGHT,
 )
-
-# Embedding,
-# PandasIndex,
+from lang_main.io import create_saving_folder, load_pickle
 from lang_main.pipelines.predefined import (
     pipe_merge,
     pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
     target_feat_data = ret[0]
     # only entries with more than threshold amount of characters
     data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    # subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
-    # dupl_idx_pairs, embds = typing.cast(
-    # tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
-    # pipe_embds.run(starting_values=(subset_data,)),
-    # )
-    # merge duplicates, results saved separately
     subset_data = target_feat_data.loc[data_filter].copy()
-    ret = typing.cast(
-        tuple[DataFrame],
-        # pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
-        pipe_merge.run(starting_values=(subset_data,)),
-    )
+    # merge duplicates, results saved separately
+    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
     preprocessed_data = ret[0]
 
     return preprocessed_data
@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
 
 print(SAVE_PATH_FOLDER)
 txt = """
-Wir feiern den Jahrestag, olé!
+Wir feiern den Jahrestag am 23.11.2023, olé!
 tel:::: !!!!???? +++49 123 456 789
 
 Doch leben wir länger.
@@ -6,26 +6,14 @@ from pathlib import Path
 from time import gmtime
 from typing import Any, Final
 
-from lang_main.analysis.graphs import TokenGraph
-from lang_main.analysis.preprocessing import Embedding, PandasIndex
-from lang_main.shared import (
-    create_saving_folder,
-    load_pickle,
-    load_toml_config,
-    save_pickle,
-)
+from lang_main.io import load_toml_config
 
 __all__ = [
-    'save_pickle',
-    'load_pickle',
-    'create_saving_folder',
-    'Embedding',
-    'PandasIndex',
-    'TokenGraph',
+    'CALLER_PATH',
 ]
 
 logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(module)s:%(levelname)s | %(asctime)s | %(message)s'
+LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
 logging.basicConfig(
     stream=sys.stdout,
@@ -35,18 +23,18 @@ logging.basicConfig(
 
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 USE_INTERNAL_CONFIG: Final[bool] = False
 
 pkg_dir = Path(__file__).parent
 cfg_path_internal = pkg_dir / CONFIG_FILENAME
-
+caller_file = Path(inspect.stack()[-1].filename)
+CALLER_PATH: Final[Path] = caller_file.parent
 # load config data: internal/external
 if USE_INTERNAL_CONFIG:
     loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
 else:
-    caller_file = Path(inspect.stack()[-1].filename)
+    cfg_path_external = CALLER_PATH / CONFIG_FILENAME
     if not caller_file.exists():
         raise FileNotFoundError('Caller file could not be correctly retrieved.')
-    cfg_path_external = caller_file.parent / CONFIG_FILENAME
     if not cfg_path_external.exists():
         shutil.copy(cfg_path_internal, cfg_path_external)
         sys.exit(
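A minimal standalone sketch (not part of the commit) of the idea behind the new CALLER_PATH constant: the outermost frame returned by inspect.stack() is the script at the bottom of the call stack, so its parent directory can serve as the default location for an external lang_main_config.toml. The caller_file.exists() check kept in the hunk above guards cases (e.g. interactive sessions) where that frame does not map to a real file.

    import inspect
    from pathlib import Path

    # outermost stack frame -> the entry script that (directly or indirectly) runs this code
    caller_file = Path(inspect.stack()[-1].filename)
    CALLER_PATH = caller_file.parent

    # an external config file would then be looked up next to that entry script
    print(CALLER_PATH / 'lang_main_config.toml')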
@@ -11,8 +11,8 @@ import numpy.typing as npt
 from networkx import DiGraph, Graph
 from pandas import DataFrame
 
+from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
-from lang_main.shared import load_pickle, save_pickle
 
 # TODO change logging behaviour, add logging to file
 LOGGING_DEFAULT: Final[bool] = False
@@ -53,10 +53,10 @@ def get_graph_metadata(
     )
 
     if logging:
-        logger.info((f'Graph properties: {num_nodes} Nodes, ' f'{num_edges} Edges'))
-        logger.info(f'Node memory: {node_mem / 1024:.2f} KB')
-        logger.info(f'Edge memory: {edge_mem / 1024:.2f} KB')
-        logger.info(f'Total memory: {total_mem / 1024:.2f} KB')
+        logger.info('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)
+        logger.info('Node memory: %.2f KB', (node_mem / 1024))
+        logger.info('Edge memory: %.2f KB', (edge_mem / 1024))
+        logger.info('Total memory: %.2f KB', (total_mem / 1024))
 
     return graph_info
 
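The logging calls above switch from f-strings to %-style messages with arguments, which defers interpolation to the logging framework. A small illustrative sketch (not taken from the repository) of the difference:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('demo')

    num_nodes, num_edges = 10, 42
    # f-string: the message text is built even though DEBUG records are filtered out
    logger.debug(f'Graph properties: {num_nodes} Nodes, {num_edges} Edges')
    # %-style with args: formatting only happens if the record is actually emitted
    logger.debug('Graph properties: %d Nodes, %d Edges', num_nodes, num_edges)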
@@ -342,7 +342,7 @@ class TokenGraph(DiGraph):
 
         saving_path = saving_path.with_suffix('.graphml')
         nx.write_graphml(G=target_graph, path=saving_path)
-        logger.info(('Successfully saved graph as GraphML file ' f'under {saving_path}.'))
+        logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
 
     def to_pickle(
         self,
@@ -374,10 +374,10 @@ class TokenGraph(DiGraph):
         match path.suffix:
             case '.graphml':
                 graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
-                logger.info(f'Successfully loaded graph from GraphML file {path}.')
+                logger.info('Successfully loaded graph from GraphML file %s.', path)
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
-                logger.info(f'Successfully loaded graph from pickle file {path}.')
+                logger.info('Successfully loaded graph from pickle file %s.', path)
             case _:
                 raise ValueError('File format not supported.')
 
@@ -25,6 +25,12 @@ from lang_main.loggers import logger_preprocess as logger
 from lang_main.pipelines.base import BasePipeline
 from lang_main.types import Embedding, PandasIndex
 
+# ** RE patterns
+pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
+pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
+pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+pattern_whitespace = re.compile(r'[ ]{2,}')
+
 
 # ** (1) dataset preparation: loading and simple preprocessing
 # following functions used to load a given dataset and perform simple
@@ -167,11 +173,11 @@ def clean_string_slim(string: str) -> str:
         cleaned entry
     """
     # remove special chars
-    pattern = r'[\t\n\r\f\v]+'
-    string = re.sub(pattern, ' ', string)
-    pattern = r'([,;.:!?-_\+]){2,}'
+    string = pattern_special_chars.sub(' ', string)
+    string = pattern_repeated_chars.sub(r'\1', string)
+    # string = pattern_dates.sub('', string)
+    string = pattern_whitespace.sub(' ', string)
     # remove whitespaces at the beginning and the end
-    string = re.sub(pattern, r'\1', string)
     string = string.strip()
 
     return string
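With the precompiled module-level patterns, the cleansing chain of clean_string_slim can be reproduced in isolation; a condensed sketch (illustrative only, using the same expressions as the hunk above, with the date substitution left disabled as in the commit):

    import re

    pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
    pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
    pattern_whitespace = re.compile(r'[ ]{2,}')

    def clean_string_slim(string: str) -> str:
        string = pattern_special_chars.sub(' ', string)     # tabs/newlines -> single space
        string = pattern_repeated_chars.sub(r'\1', string)  # 'tel::::' -> 'tel:', '+++49' -> '+49'
        string = pattern_whitespace.sub(' ', string)        # squeeze runs of blanks
        return string.strip()

    print(clean_string_slim('tel:::: !!!!????\t+++49 123 456 789\n'))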
@@ -185,11 +191,9 @@ def entry_wise_cleansing(
     # apply given cleansing function to target feature
     data[target_feature] = data[target_feature].map(cleansing_func)
     logger.info(
-        (
-            f'Successfully applied entry-wise cleansing procedure '
-            f'>>{cleansing_func.__name__}<< '
-            f'for feature >>{target_feature}<<'
-        )
+        ('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
+        cleansing_func.__name__,
+        target_feature,
     )
     return (data,)
 
@@ -203,7 +207,9 @@ def analyse_feature(
 ) -> tuple[DataFrame]:
     # feature columns
     feature_entries = data[target_feature]
-    logger.info(f'Number of entries for feature >>{target_feature}<<: {len(feature_entries)}')
+    logger.info(
+        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
+    )
     # obtain unique entries
     unique_feature_entries = feature_entries.unique()
 
@@ -265,7 +271,7 @@ def build_embedding_map(
         # check for empty vectors
         if not embd.vector_norm:
             logger.debug('--- Unknown Words ---')
-            logger.debug(f'{embd.text=} has no vector')
+            logger.debug('embd.text: %s has no vector', embd.text)
         elif is_STRF:
             model = cast(SentenceTransformer, model)
             embd = cast(Tensor, model.encode(text, show_progress_bar=False))
@@ -420,7 +426,7 @@ def list_cosSim_dupl_candidates(
         logger.info('Saving similarity candidates...')
         target_path = saving_path.joinpath(target_filename)
         df_candidates.to_excel(target_path)
-        logger.info(f'Similarity candidates saved successfully to >>{target_path}<<.')
+        logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
 
     return index_pairs, embds
 
@@ -60,7 +60,7 @@ def remove_non_relevant_obj_ids(
     )
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
-    logger.debug(f'Ignored ObjectIDs: {ids_to_ignore}')
+    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
     logger.info('Non-relevant ObjectIDs removed successfully')
 
     return (data,)
@@ -16,11 +16,6 @@ from lang_main.analysis.graphs import (
 )
 from lang_main.loggers import logger_token_analysis as logger
 
-# ** Logging
-# LOGGING_LEVEL = 'INFO'
-# logging.basicConfig(level=LOGGING_LEVEL, stream=sys.stdout)
-# logger = logging.getLogger('ihm_analyse.token_analysis')
-
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
@@ -82,10 +77,11 @@ def obtain_relevant_descendants(
             continue
 
         logger.debug(
-            (
-                f'Token >>{token}<<, POS >>{token.pos_}<< | descendant '
-                f'>>{descendant}<<, POS >>{descendant.pos_}<<'
-            )
+            'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
+            token,
+            token.pos_,
+            descendant,
+            descendant.pos_,
         )
 
         # eliminate cases of cross-references with verbs
@@ -26,10 +26,10 @@ def create_saving_folder(
     else:
         logger.info(
             (
-                f'Path >>{saving_path_folder}<< already exists and remained '
-                f'unchanged. If you want to overwrite this path, use parameter '
-                f'>>overwrite_existing<<.'
-            )
+                'Path >>%s<< already exists and remained unchanged. If you want to '
+                'overwrite this path, use parameter >>overwrite_existing<<.',
+            ),
+            saving_path_folder,
         )
 
 
@@ -50,7 +50,7 @@ def save_pickle(
 ) -> None:
     with open(path, 'wb') as file:
         pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
-    logger.info(f'Saved file successfully under {path}')
+    logger.info('Saved file successfully under %s', path)
 
 
 def load_pickle(
@@ -3,12 +3,13 @@ from typing import Final
 
 from lang_main.types import LoggingLevels
 
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = 'DEBUG'
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = 'INFO'
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = 'INFO'
+# ** logging
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
 
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Any
 
 from lang_main.loggers import logger_pipelines as logger
-from lang_main.shared import load_pickle, save_pickle
+from lang_main.io import load_pickle, save_pickle
 
 # ** pipelines to perform given actions on dataset in a customisable manner
 
@@ -110,13 +110,13 @@ class BasePipeline:
         return data
 
     def prep_run(self) -> None:
-        logger.info(f'Starting processing pipeline >>{self.name}<<...')
+        logger.info('Starting processing pipeline >>%s<<...', self.name)
         # progress tracking
         self.curr_proc_idx = 1
         # check if performable actions available
         if len(self.actions) == 0:
             raise NoPerformableActionError(
-                ('The pipeline does not contain any ' 'performable actions.')
+                'The pipeline does not contain any performable actions.'
             )
 
     def run(
@@ -139,6 +139,6 @@ class BasePipeline:
             # processing tracking
             self.curr_proc_idx += 1
 
-        logger.info(f'Processing pipeline >>{self.name}<< successfully ended.')
+        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
 
         return ret
@@ -1,16 +1,18 @@
-from typing import Literal, TypeAlias
+import enum
+from typing import TypeAlias
 
 import numpy as np
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
 
-LoggingLevels: TypeAlias = Literal[
-    'DEBUG',
-    'INFO',
-    'WARNING',
-    'ERROR',
-    'CRITICAL',
-]
+
+class LoggingLevels(enum.IntEnum):
+    DEBUG = 10
+    INFO = 20
+    WARNING = 30
+    ERROR = 40
+    CRITICAL = 50
+
 
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
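Since IntEnum members are plain integers and the chosen values mirror the stdlib logging constants, the new LoggingLevels can be handed straight to Logger.setLevel and compared with logging's own levels. A brief sketch (not part of the commit) under that assumption:

    import enum
    import logging

    class LoggingLevels(enum.IntEnum):
        DEBUG = 10
        INFO = 20
        WARNING = 30
        ERROR = 40
        CRITICAL = 50

    assert LoggingLevels.INFO == logging.INFO   # identical numeric value
    logger = logging.getLogger('lang_main.shared')
    logger.setLevel(LoggingLevels.INFO)         # accepted wherever an int level is expected
    print(logging.getLevelName(logger.level))   # prints 'INFO'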
@@ -1,5 +1,8 @@
-from typing import cast
+import time
+import webbrowser
 from pathlib import Path
+from threading import Thread
+from typing import cast
 
 import pandas as pd
 import plotly.express as px
@@ -13,17 +16,20 @@ from dash import (
     dcc,
     html,
 )
-from lang_main import load_pickle
+from lang_main import CALLER_PATH
+from lang_main.io import load_pickle
 from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame
 
 # df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
 
 # ** data
-p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
-p_tl = Path(
-    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
-)
+# p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')
+p_df = CALLER_PATH.joinpath('./Pipe-TargetFeature_Step-3_remove_NA.pkl')
+# p_tl = Path(
+#     r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
+# )
+p_tl = CALLER_PATH.joinpath('./Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl')
 ret = cast(DataFrame, load_pickle(p_df))
 data = ret[0]
 ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
@@ -171,5 +177,19 @@ def update_table_candidates(index, obj_id):
     return table_data, cols
 
 
-if __name__ == '__main__':
+def _start_webbrowser():
+    host = '127.0.0.1'
+    port = '8050'
+    adress = f'http://{host}:{port}/'
+    time.sleep(2)
+    webbrowser.open_new(adress)
+
+
+def main():
+    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
+    webbrowser_thread.start()
     app.run(debug=True)
+
+
+if __name__ == '__main__':
+    main()
tests/pre_test_examples.py (new file)
@@ -0,0 +1,15 @@
+import re
+
+
+string = """
+Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.
+"""
+
+patt = r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?'
+patt2 = r'[ ]{2,}'
+pattern = re.compile(patt)
+pattern2 = re.compile(patt2)
+res = pattern.sub('', string)
+res = pattern2.sub(' ', res)
+
+print(res)
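The new scratch script exercises the date pattern that clean_string_slim still keeps commented out; a condensed sketch (illustrative, same expressions) of its effect on the sample sentence:

    import re

    pattern = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
    pattern2 = re.compile(r'[ ]{2,}')

    s = 'Hallo mein Name ist Max Mustermann und ich bin am 01.01.2024 geboren.'
    s = pattern.sub('', s)    # drops the '01.01.2024' token
    s = pattern2.sub(' ', s)  # collapses the double space left behind
    print(s)  # Hallo mein Name ist Max Mustermann und ich bin am geboren.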