Refactoring: improved string-cleansing preprocessing
This commit is contained in:
@@ -3,11 +3,7 @@ import warnings
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
from lang_main import (
|
||||
TokenGraph,
|
||||
create_saving_folder,
|
||||
load_pickle,
|
||||
)
|
||||
from lang_main.analysis.graphs import TokenGraph
|
||||
from lang_main.constants import (
|
||||
DO_GRAPH_POSTPROCESSING,
|
||||
DO_PREPROCESSING,
|
||||
@@ -23,9 +19,7 @@ from lang_main.constants import (
|
||||
THRESHOLD_AMOUNT_CHARACTERS,
|
||||
THRESHOLD_EDGE_WEIGHT,
|
||||
)
|
||||
|
||||
# Embedding,
|
||||
# PandasIndex,
|
||||
from lang_main.io import create_saving_folder, load_pickle
|
||||
from lang_main.pipelines.predefined import (
|
||||
pipe_merge,
|
||||
pipe_target_feat,
|
||||
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
|
||||
target_feat_data = ret[0]
|
||||
# only entries with more than threshold amount of characters
|
||||
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
|
||||
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
|
||||
# dupl_idx_pairs, embds = typing.cast(
|
||||
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
|
||||
# pipe_embds.run(starting_values=(subset_data,)),
|
||||
# )
|
||||
# merge duplicates, results saved separately
|
||||
subset_data = target_feat_data.loc[data_filter].copy()
|
||||
ret = typing.cast(
|
||||
tuple[DataFrame],
|
||||
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
|
||||
pipe_merge.run(starting_values=(subset_data,)),
|
||||
)
|
||||
# merge duplicates, results saved separately
|
||||
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
|
||||
preprocessed_data = ret[0]
|
||||
|
||||
return preprocessed_data
|
||||
|
||||
@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
|
||||
|
||||
print(SAVE_PATH_FOLDER)
|
||||
txt = """
|
||||
Wir feiern den Jahrestag, olé!
|
||||
Wir feiern den Jahrestag am 23.11.2023, olé!
|
||||
tel:::: !!!!???? +++49 123 456 789
|
||||
|
||||
Doch leben wir länger.
|
||||
|
||||
Reference in New Issue
Block a user