from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import cast

import numpy as np
import pandas as pd
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex

# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')

# ** (1) dataset preparation: loading and simple preprocessing
# The following functions load a given dataset and perform simple
# duplicate cleansing based on all properties.
def load_raw_data(
    path: Path,
    date_cols: Collection[str] = (
        'VorgangsDatum',
        'ErledigungsDatum',
        'Arbeitsbeginn',
        'ErstellungsDatum',
    ),
) -> tuple[DataFrame]:
    """load IHM dataset with standard structure

    Parameters
    ----------
    path : Path
        path to the dataset file, usually a CSV file
    date_cols : Collection[str], optional
        columns which contain dates and are parsed as such,
        by default (
            'VorgangsDatum',
            'ErledigungsDatum',
            'Arbeitsbeginn',
            'ErstellungsDatum',
        )

    Returns
    -------
    tuple[DataFrame]
        raw dataset as DataFrame, wrapped in a one-element tuple
    """
    # load dataset
    date_cols = list(date_cols)
    data = pd.read_csv(
        filepath_or_buffer=path,
        sep=';',
        encoding='cp1252',
        parse_dates=date_cols,
        dayfirst=True,
    )
    logger.info('Loaded dataset successfully.')
    logger.info(
        (
            f'Dataset properties: number of entries: {len(data)}, '
            f'number of features: {len(data.columns)}'
        )
    )
    return (data,)

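# Example usage (sketch): loading a raw IHM export; the file path below is
# hypothetical and only illustrates the call.
#   (raw_data,) = load_raw_data(Path('data/ihm_export.csv'))
#   # 'VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn' and 'ErstellungsDatum'
#   # are parsed as datetimes (day-first, cp1252 encoding, ';' separator)
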
def remove_duplicates(
    data: DataFrame,
) -> tuple[DataFrame]:
    """remove duplicated entries, first over all features, then over the
    subset ('VorgangsID', 'ObjektID')

    Parameters
    ----------
    data : DataFrame
        read data with standard structure

    Returns
    -------
    tuple[DataFrame]
        dataset with duplicates removed
    """
    # obtain info about duplicates over all features
    duplicates_filt = data.duplicated()
    logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
    # drop duplicates
    wo_duplicates = data.drop_duplicates(ignore_index=True)
    duplicates_subset: list[str] = [
        'VorgangsID',
        'ObjektID',
    ]
    duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
    logger.info(
        (
            'Number of duplicates over subset '
            f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
        )
    )
    wo_duplicates = wo_duplicates.drop_duplicates(
        subset=duplicates_subset, ignore_index=True
    ).copy()
    logger.info('Removed all duplicates from dataset successfully.')
    logger.info(
        'New dataset properties: number of entries: %d, number of features: %d',
        len(wo_duplicates),
        len(wo_duplicates.columns),
    )

    return (wo_duplicates,)

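# Example (sketch): how the two-stage deduplication behaves on a toy frame.
# Rows that agree on every column are dropped first; afterwards rows that only
# share 'VorgangsID' and 'ObjektID' are collapsed to their first occurrence.
#   toy = pd.DataFrame(
#       {
#           'VorgangsID': [1, 1, 1],
#           'ObjektID': [10, 10, 10],
#           'VorgangsBeschreibung': ['a', 'a', 'b'],
#       }
#   )
#   (deduped,) = remove_duplicates(toy)  # keeps a single row ('a')
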
def remove_NA(
    data: DataFrame,
    target_features: Collection[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
    """drop NA entries based on a subset of features to be analysed

    Parameters
    ----------
    data : DataFrame
        standard IHM dataset, possibly pre-cleaned
    target_features : Collection[str], optional
        subset of features that defines an NA entry,
        by default ('VorgangsBeschreibung',)

    Returns
    -------
    tuple[DataFrame]
        dataset with NA entries removed for the given subset of features
    """
    target_features = list(target_features)
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
    )

    return (wo_NA,)

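# Example (sketch): the three preparation steps are typically chained, each step
# consuming the one-element tuple returned by the previous one; the path is
# hypothetical.
#   (raw,) = load_raw_data(Path('data/ihm_export.csv'))
#   (deduped,) = remove_duplicates(raw)
#   (cleaned,) = remove_NA(deduped, target_features=('VorgangsBeschreibung',))
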
# ** (2) entry-based cleansing
# ** moved to module ``lang_main.analysis.shared``


# ** in-depth analysis of one feature
# The following functions gain insights into a given feature of the IHM dataset,
# such as the number of occurrences or the associated object IDs.
def analyse_feature(
    data: DataFrame,
    target_feature: str,
) -> tuple[DataFrame]:
    # feature column
    feature_entries = data[target_feature]
    logger.info(
        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
    )
    # obtain unique entries
    unique_feature_entries = feature_entries.unique()

    # prepare result DataFrame
    cols = ['batched_idxs', 'entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
    result_df = pd.DataFrame(columns=cols)

    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
        batched_idxs = temp.index.to_numpy()
        assoc_obj_ids = temp['ObjektID'].unique()
        assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()

        conc_df = pd.DataFrame(
            data=[
                [batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]
            ],
            columns=cols,
        )

        result_df = pd.concat([result_df, conc_df], ignore_index=True)

    result_df = result_df.sort_values(by='num_occur', ascending=False).copy()

    return (result_df,)

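# Example (sketch): analysing the free-text feature 'VorgangsBeschreibung' of a
# pre-cleaned dataset (``cleaned`` from the sketch above). The resulting frame
# holds one row per unique entry with the columns 'batched_idxs', 'entry',
# 'len', 'num_occur', 'assoc_obj_ids' and 'num_assoc_obj_ids', sorted by
# 'num_occur' in descending order.
#   (feature_overview,) = analyse_feature(cleaned, target_feature='VorgangsBeschreibung')
#   print(feature_overview.head())
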
# ** pre-filter
def numeric_pre_filter_feature(
    data: DataFrame,
    feature: str,
    bound_lower: int | None,
    bound_upper: int | None,
) -> tuple[DataFrame]:
    # only None counts as 'no bound' so that 0 remains a valid bound
    if bound_lower is None and bound_upper is None:
        raise ValueError('No bounds for filtering provided')

    data = data.copy()
    if bound_lower is None:
        bound_lower = cast(int, data[feature].min())
    if bound_upper is None:
        bound_upper = cast(int, data[feature].max())

    filter_lower = data[feature] >= bound_lower
    filter_upper = data[feature] <= bound_upper
    filter_mask = filter_lower & filter_upper

    data = data.loc[filter_mask]

    return (data,)

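# Example (sketch): keep only entries that occur at least twice; passing None
# for a bound leaves that side open (the column minimum/maximum is used).
#   (frequent,) = numeric_pre_filter_feature(
#       feature_overview, feature='num_occur', bound_lower=2, bound_upper=None
#   )
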
# ** embedding based similarity
# The following functions identify similar entries to make the detection of
# duplicates more robust against disturbances such as typos, escape
# characters, etc.
def merge_similarity_dupl(
    data: DataFrame,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> tuple[DataFrame]:
    logger.info('Start merging of similarity candidates...')

    # data
    merged_data = data.copy()
    model_input = merged_data['entry']
    candidates_idx = candidates_by_index(
        data_model_input=model_input,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
    )
    # graph of similar ids
    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)

    for similar_id_group in similar_index_groups(similar_id_graph):
        similar_id_group = list(similar_id_group)
        similar_data = merged_data.loc[similar_id_group, :]
        # keep the first entry: max number of occurrences, then number of
        # associated objects, then length of entry
        similar_data = similar_data.sort_values(
            by=['num_occur', 'num_assoc_obj_ids', 'len'],
            ascending=[False, False, False],
        )
        # merge information into the first entry
        data_idx = cast(PandasIndex, similar_data.index[0])
        similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
        assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
        assoc_obj_ids = np.concatenate(assoc_obj_ids)
        assoc_obj_ids = np.unique(assoc_obj_ids)
        similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
        similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
        # remaining indices are to be removed
        similar_id_group.remove(data_idx)
        merged_similar_data = similar_data.drop(index=similar_id_group)
        # update entry in main dataset, drop remaining entries
        merged_data.update(merged_similar_data)
        merged_data = merged_data.drop(index=similar_id_group)

    logger.info('Similarity candidates merged successfully.')

    return (merged_data,)

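# Example (sketch): merging near-duplicate entries of the feature overview with
# a SentenceTransformer model; the model name and threshold are illustrative.
#   model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
#   (merged_overview,) = merge_similarity_dupl(
#       frequent, model=model, cos_sim_threshold=0.85
#   )
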
# ** #################################################################################
# TODO check removal
# build mapping of embeddings for given model
def build_embedding_map(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
    # dictionary with embeddings
    embeddings: dict[int, tuple[Embedding, str]] = {}
    is_spacy = False
    is_STRF = False

    if isinstance(model, GermanSpacyModel):
        is_spacy = True
    elif isinstance(model, SentenceTransformer):
        is_STRF = True

    if not any((is_spacy, is_STRF)):
        raise NotImplementedError('Model type unknown')

    for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
        # verbose code: Pyright not inferring types correctly
        idx = cast(int, idx)
        text = cast(str, text)
        if is_spacy:
            model = cast(GermanSpacyModel, model)
            embd = cast(SpacyDoc, model(text))
            embeddings[idx] = (embd, text)
            # check for empty vectors
            if not embd.vector_norm:
                logger.debug('--- Unknown Words ---')
                logger.debug('embd.text: %s has no vector', embd.text)
        elif is_STRF:
            model = cast(SentenceTransformer, model)
            embd = cast(Tensor, model.encode(text, show_progress_bar=False))
            embeddings[idx] = (embd, text)

    return embeddings, (is_spacy, is_STRF)

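# Example (sketch): the returned mapping associates each index of the input
# Series with a tuple of (embedding, original text); the flags signal which
# model family produced the embeddings.
#   embds, (is_spacy, is_strf) = build_embedding_map(
#       data=merged_overview['entry'], model=model
#   )
#   vector, text = embds[0]  # assumes index 0 exists in the input Series
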
# TODO adapt interface:
# use the candidates-by-index function, which merges
# build_embedding_map, build_cosSim_matrix and filt_thresh_cosSim_matrix


# build similarity matrix out of embeddings
def build_cosSim_matrix(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
    # build empty matrix
    df_index = data.index
    cosineSim_idx_matrix = pd.DataFrame(
        data=0.0, columns=df_index, index=df_index, dtype=np.float32
    )

    logger.info('Start building embedding map...')

    # obtain embeddings based on the used model
    embds, (is_spacy, is_STRF) = build_embedding_map(
        data=data,
        model=model,
    )

    logger.info('Embedding map built successfully.')

    # apply index-based mapping for efficient handling of large texts
    combs = combinations(df_index, 2)
    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)

    logger.info('Start calculation of similarity scores...')

    for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
        # print(f"{idx1=}, {idx2=}")
        embd1 = embds[idx1][0]
        embd2 = embds[idx2][0]

        # calculate similarity based on model type
        if is_spacy:
            embd1 = cast(SpacyDoc, embds[idx1][0])
            embd2 = cast(SpacyDoc, embds[idx2][0])
            cosSim = embd1.similarity(embd2)
        elif is_STRF:
            embd1 = cast(Tensor, embds[idx1][0])
            embd2 = cast(Tensor, embds[idx2][0])
            cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
            cosSim = cast(float, cosSim.item())

        cosineSim_idx_matrix.at[idx1, idx2] = cosSim

    logger.info('Similarity scores calculated successfully.')

    return cosineSim_idx_matrix, embds

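# Example (sketch): the loop covers all n * (n - 1) / 2 unordered index pairs,
# so runtime grows quadratically with the number of unique entries; only one
# cell per pair is filled, the matrix is not symmetrised.
#   cos_matrix, embds = build_cosSim_matrix(data=merged_overview['entry'], model=model)
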
# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
    cosineSim_idx_matrix: DataFrame,
    embds: dict[int, tuple[Embedding, str]],
    threshold: float,
) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
    """filter similarity matrix by threshold value and return index pairs with
    a similarity score greater than or equal to the provided threshold

    Parameters
    ----------
    cosineSim_idx_matrix : DataFrame
        similarity matrix
    embds : dict[int, tuple[Embedding, str]]
        mapping of indices to embeddings and texts, passed through unchanged
    threshold : float
        similarity threshold

    Returns
    -------
    tuple[Series, dict[int, tuple[Embedding, str]]]
        series with multi index (index pairs) and corresponding similarity score,
        plus the unchanged embedding mapping
    """
    cosineSim_filt = cast(
        Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
    )

    return cosineSim_filt, embds

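# Example (sketch): ``where(...).stack()`` keeps only cells at or above the
# threshold and returns a Series whose MultiIndex holds the surviving
# (idx1, idx2) pairs; the threshold value is illustrative.
#   cos_sim_filt, embds = filt_thresh_cosSim_matrix(
#       cosineSim_idx_matrix=cos_matrix, embds=embds, threshold=0.85
#   )
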
def list_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
    save_candidates: bool = False,
    saving_path: Path | None = None,
    filename: str = 'CosSim-FilterCandidates',
    pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
    """provide an overview of candidates whose similarity score passed the
    threshold filter; the overview can optionally be saved as an Excel file,
    which is mainly useful for debugging purposes

    Returns
    -------
    list[tuple[PandasIndex, PandasIndex]]
        index pairs of entries whose similarity score passed the threshold filter
    dict[int, tuple[Embedding, str]]
        unchanged mapping of indices to embeddings and texts
    """
    logger.info('Start gathering of similarity candidates...')
    # compare found duplicates
    columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)

    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
        # get text content from embedding map as second tuple entry
        content = [
            [
                idx1,
                embds[idx1][1],
                idx2,
                embds[idx2][1],
                score,
            ]
        ]
        # add candidates to collection DataFrame
        df_conc = pd.DataFrame(columns=columns, data=content)
        if df_candidates.empty:
            df_candidates = df_conc.copy()
        else:
            df_candidates = pd.concat([df_candidates, df_conc])
        # save index pairs
        index_pairs.append((idx1, idx2))

    logger.info('Similarity candidates gathered successfully.')

    if save_candidates:
        if saving_path is None:
            raise ValueError(
                'Saving path must be provided if duplicate candidates should be saved.'
            )
        elif pipeline is not None:
            target_filename = (
                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
            )
        else:
            target_filename = f'{filename}.xlsx'
        logger.info('Saving similarity candidates...')
        target_path = saving_path.joinpath(target_filename)
        df_candidates.to_excel(target_path)
        logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)

    return index_pairs, embds

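# Example (sketch): listing the candidate pairs and optionally exporting them to
# an Excel overview; the saving path is hypothetical.
#   index_pairs, embds = list_cosSim_dupl_candidates(
#       cosineSim_filt=cos_sim_filt,
#       embds=embds,
#       save_candidates=True,
#       saving_path=Path('results'),
#   )
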
# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py``
"""
def similar_ids_connection_graph(
    similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
    # build index graph to obtain graph of connected (similar) indices
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    for (idx1, idx2) in similar_idx_pairs:
        # inplace operation, parent/child do not really exist in undirected graph
        update_graph(graph=similar_id_graph, parent=idx1, child=idx2)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)

    return similar_id_graph, graph_info


def similar_ids_groups(
    dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]],
                      nx.connected_components(G=dupl_id_graph))

    for id_group in ids_groups:
        yield list(id_group)
"""

# merge duplicates
def merge_similarity_dupl_old(
    data: DataFrame,
    dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]:
    # copy pre-cleaned data
    temp = data.copy()
    index = temp.index
    # logger.info("Start merging of similarity candidates...")

    # iterate over index pairs
    for i1, i2 in tqdm(dupl_idx_pairs):
        # if an entry does not exist any more, skip this pair
        if i1 not in index or i2 not in index:
            continue

        # merge num occur
        num_occur1 = temp.at[i1, 'num_occur']
        num_occur2 = temp.at[i2, 'num_occur']
        new_num_occur = num_occur1 + num_occur2

        # merge associated object ids
        assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
        assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
        new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
        new_assoc_ids = np.unique(new_assoc_ids.flatten())

        # recalculate num associated obj ids
        new_num_assoc_obj_ids = len(new_assoc_ids)

        # write properties to first entry
        temp.at[i1, 'num_occur'] = new_num_occur
        temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
        temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids

        # drop second entry
        temp = temp.drop(index=i2)
        index = temp.index

    # logger.info("Similarity candidates merged successfully.")

    return (temp,)

# ** debugging and evaluation
def choose_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
    """provide an overview of candidates whose similarity score passed the
    threshold filter; the decision whether a pair is a duplicate is made
    manually by iterating through the candidates with user interaction;
    mainly useful for debugging purposes

    Returns
    -------
    DataFrame
        contains indices, corresponding texts and similarity score to evaluate results
    list[tuple[PandasIndex, PandasIndex]]
        index pairs of entries confirmed as duplicates by the user
    """
    # compare found duplicates
    columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)

    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in cosineSim_filt.items():  # type: ignore
        # get texts for comparison
        text1 = embds[idx1][1]
        text2 = embds[idx2][1]
        # get decision
        print('---------- New Decision ----------')
        print('text1:\n', text1, '\n', flush=True)
        print('text2:\n', text2, '\n', flush=True)
        decision = input('Please enter >>y<< if this is a duplicate, else hit enter: ')

        if decision != 'y':
            continue

        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                text1,
                idx2,
                text2,
                score,
            ]
        ]
        df_conc = pd.DataFrame(columns=columns, data=content)

        df_candidates = pd.concat([df_candidates, df_conc])
        index_pairs.append((idx1, idx2))

    return df_candidates, index_pairs