# lang-main/src/lang_main/analysis/preprocessing.py
import re
from collections.abc import Iterable
from itertools import combinations
from math import comb
from pathlib import Path
from typing import Callable, cast
import numpy as np
import pandas as pd
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm
from lang_main.analysis.shared import (
candidates_by_index,
similar_index_connection_graph,
similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import BasePipeline
from lang_main.types import Embedding, PandasIndex
# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
# hyphen escaped so it matches literally instead of forming a character range
pattern_repeated_chars = re.compile(r'([,;.:!?\-_+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing
# the following functions load a given dataset and perform simple
# duplicate cleansing based on all properties
def load_raw_data(
path: Path,
date_cols: Iterable[str] = (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
'ErstellungsDatum',
),
) -> tuple[DataFrame]:
"""load IHM dataset with standard structure
Parameters
----------
path : str
path to dataset file, usually CSV file
date_cols : list[str], optional
columns which contain dates and are parsed as such,
by default (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
'ErstellungsDatum',
)
Returns
-------
DataFrame
raw dataset as DataFrame
"""
# load dataset
date_cols = list(date_cols)
data = pd.read_csv(
filepath_or_buffer=path,
sep=';',
encoding='cp1252',
parse_dates=date_cols,
dayfirst=True,
)
logger.info('Loaded dataset successfully.')
logger.info(
(
f'Dataset properties: number of entries: {len(data)}, '
            f'number of features: {len(data.columns)}'
)
)
return (data,)
def remove_duplicates(
data: DataFrame,
) -> tuple[DataFrame]:
"""removes duplicated entries over all features in the given dataset
Parameters
----------
data : DataFrame
read data with standard structure
Returns
-------
DataFrame
dataset with removed duplicates over all features
"""
# obtain info about duplicates over all features
duplicates_filt = data.duplicated()
logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
# drop duplicates
wo_duplicates = data.drop_duplicates(ignore_index=True)
duplicates_subset: list[str] = [
'VorgangsID',
'ObjektID',
]
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
logger.info(
(
'Number of duplicates over subset '
f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
)
)
wo_duplicates = wo_duplicates.drop_duplicates(
subset=duplicates_subset, ignore_index=True
).copy()
logger.info('Removed all duplicates from dataset successfully.')
logger.info(
(
f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
            f'number of features: {len(wo_duplicates.columns)}'
)
)
return (wo_duplicates,)
def remove_NA(
    data: DataFrame,
    target_features: Iterable[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
    """drop NA entries based on a subset of features to be analysed

    Parameters
    ----------
    data : DataFrame
        standard IHM dataset, possibly pre-cleaned
    target_features : Iterable[str], optional
        subset of features which defines an NA entry,
        by default ('VorgangsBeschreibung',)

    Returns
    -------
    tuple[DataFrame]
        single-element tuple with NA entries removed for the given subset
    """
    target_features = list(target_features)
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
logger.info(
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
)
return (wo_NA,)
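

# Illustrative usage of the loading steps above (not called anywhere in this
# module): chains loading, duplicate removal and NA removal. The CSV path is
# an assumption and has to be adapted to the actual dataset location.
def _example_prepare_raw_dataset() -> DataFrame:
    (raw,) = load_raw_data(path=Path('data/ihm_export.csv'))
    (deduplicated,) = remove_duplicates(data=raw)
    (cleaned,) = remove_NA(data=deduplicated)
    return cleaned

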
# ** (2) entry-based cleansing
# the following functions clean and prepare individual entries, not the whole dataset
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
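

# Worked example for ``clean_string_slim`` (the sample text is made up):
# newline runs become single spaces, repeated punctuation collapses to a
# single character and leading/trailing whitespace is stripped.
def _example_clean_string_slim() -> str:
    # 'Pumpe defekt!!\n\n  Motor  prüfen...' -> 'Pumpe defekt! Motor prüfen.'
    return clean_string_slim('Pumpe defekt!!\n\n  Motor  prüfen...')

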
def entry_wise_cleansing(
    data: DataFrame,
    target_feature: str,
    cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
    """apply the given cleansing function element-wise to the target feature"""
    # apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
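

# Illustrative sketch: applying the slim string cleansing to the incident
# description feature. The feature name mirrors the default used in
# ``remove_NA`` above; whether it is the right target depends on the dataset.
def _example_clean_descriptions(data: DataFrame) -> DataFrame:
    (cleaned,) = entry_wise_cleansing(
        data=data,
        target_feature='VorgangsBeschreibung',
        cleansing_func=clean_string_slim,
    )
    return cleaned

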
# ** in-depth analysis of one feature
# the following functions gain insights into a given feature of the IHM dataset,
# such as the number of occurrences or the associated object IDs
def analyse_feature(
    data: DataFrame,
    target_feature: str,
) -> tuple[DataFrame]:
    """aggregate statistics for each unique entry of the target feature:
    entry length, number of occurrences and associated object IDs"""
    # feature column
    feature_entries = data[target_feature]
    logger.info(
        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
    )
    # obtain unique entries
    unique_feature_entries = feature_entries.unique()
    # prepare result DataFrame
    cols = ['entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
    # collect one row per unique entry, build the DataFrame once afterwards
    rows: list[list[object]] = []
    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
        assoc_obj_ids = temp['ObjektID'].unique()
        assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()
        rows.append([entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids])
    result_df = pd.DataFrame(data=rows, columns=cols)
    result_df = result_df.sort_values(by='num_occur', ascending=False).copy()
    return (result_df,)
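

# Illustrative sketch: aggregating occurrence statistics for the description
# feature. The resulting frame (columns ``entry``, ``len``, ``num_occur``,
# ``assoc_obj_ids``, ``num_assoc_obj_ids``) is the input expected by
# ``merge_similarity_dupl`` further below.
def _example_analyse_descriptions(data: DataFrame) -> DataFrame:
    (feature_stats,) = analyse_feature(data=data, target_feature='VorgangsBeschreibung')
    return feature_stats

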
# ** embedding based similarity
# the following functions identify similar entries, allowing a more robust
# detection of duplicates that is not thrown off by disturbances such as
# typos, escape characters, etc.
# build mapping of embeddings for given model
def build_embedding_map(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
    """map each index of the given series to a tuple of (embedding, text) using
    the provided spaCy or SentenceTransformer model; additionally returns flags
    indicating which model type was used"""
    # dictionary with embeddings
embeddings: dict[int, tuple[Embedding, str]] = {}
is_spacy = False
is_STRF = False
if isinstance(model, GermanSpacyModel):
is_spacy = True
elif isinstance(model, SentenceTransformer):
is_STRF = True
if not any((is_spacy, is_STRF)):
raise NotImplementedError('Model type unknown')
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
# verbose code: Pyright not inferring types correctly
idx = cast(int, idx)
text = cast(str, text)
if is_spacy:
model = cast(GermanSpacyModel, model)
embd = cast(SpacyDoc, model(text))
embeddings[idx] = (embd, text)
# check for empty vectors
if not embd.vector_norm:
logger.debug('--- Unknown Words ---')
logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF:
model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False))
embeddings[idx] = (embd, text)
return embeddings, (is_spacy, is_STRF)
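

# Illustrative sketch: building an embedding map with a SentenceTransformer.
# The model name is an assumption; any multilingual or German sentence
# embedding model can be substituted.
def _example_build_embeddings(texts: Series) -> dict[int, tuple[Embedding, str]]:
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    embeddings, (_is_spacy, _is_strf) = build_embedding_map(data=texts, model=model)
    return embeddings

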
# TODO: adapt interface and use the candidates-by-index function, which merges
# build_embedding_map, build_cosSim_matrix and filt_thresh_cosSim_matrix
# build similarity matrix out of embeddings
def build_cosSim_matrix(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
    """build an upper-triangular matrix of pairwise cosine similarities
    between all entries of the given series"""
# build empty matrix
df_index = data.index
cosineSim_idx_matrix = pd.DataFrame(
data=0.0, columns=df_index, index=df_index, dtype=np.float32
)
logger.info('Start building embedding map...')
# obtain embeddings based on used model
embds, (is_spacy, is_STRF) = build_embedding_map(
data=data,
model=model,
)
logger.info('Embedding map built successfully.')
# apply index based mapping for efficient handling of large texts
combs = combinations(df_index, 2)
    total_combs = comb(len(df_index), 2)
logger.info('Start calculation of similarity scores...')
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
# print(f"{idx1=}, {idx2=}")
embd1 = embds[idx1][0]
embd2 = embds[idx2][0]
# calculate similarity based on model type
if is_spacy:
embd1 = cast(SpacyDoc, embds[idx1][0])
embd2 = cast(SpacyDoc, embds[idx2][0])
cosSim = embd1.similarity(embd2)
elif is_STRF:
embd1 = cast(Tensor, embds[idx1][0])
embd2 = cast(Tensor, embds[idx2][0])
cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
cosSim = cast(float, cosSim.item())
cosineSim_idx_matrix.at[idx1, idx2] = cosSim
logger.info('Similarity scores calculated successfully.')
return cosineSim_idx_matrix, embds
# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
cosineSim_idx_matrix: DataFrame,
embds: dict[int, tuple[Embedding, str]],
threshold: float,
) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
"""filter similarity matrix by threshold value and return index pairs with
a similarity score greater than the provided threshold
Parameters
----------
threshold : float
similarity threshold
cosineSim_idx_matrix : DataFrame
similarity matrix
Returns
-------
Series
series with multi index (index pairs) and corresponding similarity score
"""
cosineSim_filt = cast(
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
)
return cosineSim_filt, embds
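

# Illustrative sketch: chaining matrix construction and threshold filtering to
# obtain the candidate pairs. The threshold value is an assumption and has to
# be tuned for the model in use.
def _example_similar_index_pairs(
    texts: Series,
    model: SentenceTransformer,
) -> Series:
    cos_sim_matrix, embds = build_cosSim_matrix(data=texts, model=model)
    cos_sim_filt, _ = filt_thresh_cosSim_matrix(
        cosineSim_idx_matrix=cos_sim_matrix,
        embds=embds,
        threshold=0.85,
    )
    return cos_sim_filt

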
def list_cosSim_dupl_candidates(
cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]],
save_candidates: bool = False,
saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates',
pipeline: BasePipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes
Returns
-------
DataFrame
contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than
given threshold
"""
logger.info('Start gathering of similarity candidates...')
# compare found duplicates
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
# get text content from embedding as second tuple entry
content = [
[
idx1,
embds[idx1][1],
idx2,
embds[idx2][1],
score,
]
]
# add candidates to collection DataFrame
df_conc = pd.DataFrame(columns=columns, data=content)
if df_candidates.empty:
df_candidates = df_conc.copy()
else:
df_candidates = pd.concat([df_candidates, df_conc])
# save index pairs
index_pairs.append((idx1, idx2))
logger.info('Similarity candidates gathered successfully.')
if save_candidates:
if saving_path is None:
raise ValueError(
('Saving path must be provided if duplicate ' 'candidates should be saved.')
)
        elif pipeline is not None:
            target_filename = (
                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_{filename}.xlsx'
            )
        else:
            target_filename = f'{filename}.xlsx'
logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path)
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
return index_pairs, embds
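

# Illustrative sketch: exporting the filtered candidate pairs for manual
# review. The output directory is an assumption and must exist beforehand.
def _example_export_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
) -> list[tuple[PandasIndex, PandasIndex]]:
    index_pairs, _ = list_cosSim_dupl_candidates(
        cosineSim_filt=cosineSim_filt,
        embds=embds,
        save_candidates=True,
        saving_path=Path('./results'),
    )
    return index_pairs

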
# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py``
"""
def similar_ids_connection_graph(
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
# build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
for (idx1, idx2) in similar_idx_pairs:
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
return similar_id_graph, graph_info
def similar_ids_groups(
dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=dupl_id_graph))
for id_group in ids_groups:
yield list(id_group)
"""
def merge_similarity_dupl(
    data: DataFrame,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> tuple[DataFrame]:
    """merge entries of the feature-analysis DataFrame whose embeddings reach
    the given cosine similarity threshold; occurrence counts and associated
    object IDs of each similarity group are accumulated on the entry kept"""
    logger.info('Start merging of similarity candidates...')
# data
merged_data = data.copy()
model_input = merged_data['entry']
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=['num_occur', 'num_assoc_obj_ids', 'len'],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info('Similarity candidates merged successfully.')
return (merged_data.copy(),)
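

# Illustrative end-to-end sketch of the similarity-based merging: analyse a
# feature, then merge near-duplicate entries. Model name and threshold are
# assumptions.
def _example_merge_similar_entries(data: DataFrame) -> DataFrame:
    (feature_stats,) = analyse_feature(data=data, target_feature='VorgangsBeschreibung')
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    (merged,) = merge_similarity_dupl(
        data=feature_stats,
        model=model,
        cos_sim_threshold=0.85,
    )
    return merged

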
# merge duplicates
def merge_similarity_dupl_old(
data: DataFrame,
dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]:
# copy pre-cleaned data
temp = data.copy()
index = temp.index
# logger.info("Start merging of similarity candidates...")
# iterate over index pairs
for i1, i2 in tqdm(dupl_idx_pairs):
# if an entry does not exist any more, skip this pair
if i1 not in index or i2 not in index:
continue
# merge num occur
num_occur1 = temp.at[i1, 'num_occur']
num_occur2 = temp.at[i2, 'num_occur']
new_num_occur = num_occur1 + num_occur2
# merge associated object ids
assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
new_assoc_ids = np.unique(new_assoc_ids.flatten())
# recalculate num associated obj ids
new_num_assoc_obj_ids = len(new_assoc_ids)
# write properties to first entry
temp.at[i1, 'num_occur'] = new_num_occur
temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids
# drop second entry
temp = temp.drop(index=i2)
index = temp.index
# logger.info("Similarity candidates merged successfully.")
return (temp,)
# ** debugging and evaluation
def choose_cosSim_dupl_candidates(
cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
"""providing an overview of candidates with a similarity score greater than
given threshold, but decision is made manually by iterating through the candidates
with user interaction; more suitable for debugging purposes
Returns
-------
DataFrame
contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than
given threshold
"""
# compare found duplicates
columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# get texts for comparison
text1 = embds[idx1][1]
text2 = embds[idx2][1]
# get decision
print('---------- New Decision ----------')
print('text1:\n', text1, '\n', flush=True)
print('text2:\n', text2, '\n', flush=True)
decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')
        if decision != 'y':
continue
# get text content from embedding as second tuple entry
content = [
[
idx1,
text1,
idx2,
text2,
score,
]
]
df_conc = pd.DataFrame(columns=columns, data=content)
df_candidates = pd.concat([df_candidates, df_conc])
index_pairs.append((idx1, idx2))
return df_candidates, index_pairs