Refactoring and improved string-cleansing preprocessing

This commit is contained in:
Florian Förster
2024-05-31 09:59:22 +02:00
parent bb987e2108
commit 9cafc9fb97
13 changed files with 111 additions and 98 deletions

View File

@@ -3,11 +3,7 @@ import warnings
from pathlib import Path
from typing import cast
from lang_main import (
TokenGraph,
create_saving_folder,
load_pickle,
)
from lang_main.analysis.graphs import TokenGraph
from lang_main.constants import (
DO_GRAPH_POSTPROCESSING,
DO_PREPROCESSING,
@@ -23,9 +19,7 @@ from lang_main.constants import (
THRESHOLD_AMOUNT_CHARACTERS,
THRESHOLD_EDGE_WEIGHT,
)
# Embedding,
# PandasIndex,
from lang_main.io import create_saving_folder, load_pickle
from lang_main.pipelines.predefined import (
pipe_merge,
pipe_target_feat,
@@ -52,18 +46,9 @@ def run_preprocessing() -> DataFrame:
target_feat_data = ret[0]
# only entries with more than threshold amount of characters
data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
# subset_data = target_feat_data.loc[data_filter, 'entry'].copy()
# dupl_idx_pairs, embds = typing.cast(
# tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]],
# pipe_embds.run(starting_values=(subset_data,)),
# )
# merge duplicates, results saved separately
subset_data = target_feat_data.loc[data_filter].copy()
ret = typing.cast(
tuple[DataFrame],
# pipe_merge.run(starting_values=(target_feat_data, dupl_idx_pairs)),
pipe_merge.run(starting_values=(subset_data,)),
)
# merge duplicates, results saved separately
ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
preprocessed_data = ret[0]
return preprocessed_data

View File

@@ -3,7 +3,7 @@ from lang_main.constants import SAVE_PATH_FOLDER
print(SAVE_PATH_FOLDER)
txt = """
Wir feiern den Jahrestag, olé!
Wir feiern den Jahrestag am 23.11.2023, olé!
tel:::: !!!!???? +++49 123 456 789
Doch leben wir länger.