from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import cast

import numpy as np
import pandas as pd
import sentence_transformers
import sentence_transformers.util
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from spacy.lang.de import German as GermanSpacyModel
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
from tqdm import tqdm

from lang_main.analysis.shared import (
    candidates_by_index,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex

# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')

# ** (1) dataset preparation: loading and simple preprocessing
# The following functions load a given dataset and perform simple
# duplicate cleansing based on all properties.
def load_raw_data(
    path: Path,
    date_cols: Collection[str] = (
        'VorgangsDatum',
        'ErledigungsDatum',
        'Arbeitsbeginn',
        'ErstellungsDatum',
    ),
) -> tuple[DataFrame]:
    """load IHM dataset with standard structure

    Parameters
    ----------
    path : Path
        path to the dataset file, usually a CSV file
    date_cols : Collection[str], optional
        columns which contain dates and are parsed as such,
        by default (
            'VorgangsDatum',
            'ErledigungsDatum',
            'Arbeitsbeginn',
            'ErstellungsDatum',
        )

    Returns
    -------
    tuple[DataFrame]
        raw dataset as DataFrame, wrapped in a one-element tuple
    """
    # load dataset
    date_cols = list(date_cols)
    data = pd.read_csv(
        filepath_or_buffer=path,
        sep=';',
        encoding='cp1252',
        parse_dates=date_cols,
        dayfirst=True,
    )
    logger.info('Loaded dataset successfully.')
    logger.info(
        (
            f'Dataset properties: number of entries: {len(data)}, '
            f'number of features: {len(data.columns)}'
        )
    )
    return (data,)

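# Example usage (sketch): loading a raw IHM export; the file path below is
# hypothetical and only illustrates the call.
#   (raw_data,) = load_raw_data(Path('data/ihm_export.csv'))
#   # 'VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn' and 'ErstellungsDatum'
#   # are parsed as datetimes (day-first, cp1252 encoding, ';' separator)
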
def remove_duplicates(
    data: DataFrame,
) -> tuple[DataFrame]:
    """remove duplicated entries, first over all features, then over the
    subset ('VorgangsID', 'ObjektID')

    Parameters
    ----------
    data : DataFrame
        read data with standard structure

    Returns
    -------
    tuple[DataFrame]
        dataset with duplicates removed
    """
    # obtain info about duplicates over all features
    duplicates_filt = data.duplicated()
    logger.info(f'Number of duplicates over all features: {duplicates_filt.sum()}')
    # drop duplicates
    wo_duplicates = data.drop_duplicates(ignore_index=True)
    duplicates_subset: list[str] = [
        'VorgangsID',
        'ObjektID',
    ]
    duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
    logger.info(
        (
            'Number of duplicates over subset '
            f'>>{duplicates_subset}<<: {duplicates_subset_filt.sum()}'
        )
    )
    wo_duplicates = wo_duplicates.drop_duplicates(
        subset=duplicates_subset, ignore_index=True
    ).copy()
    logger.info('Removed all duplicates from dataset successfully.')
    logger.info(
        'New dataset properties: number of entries: %d, number of features: %d',
        len(wo_duplicates),
        len(wo_duplicates.columns),
    )

    return (wo_duplicates,)

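# Example (sketch): how the two-stage deduplication behaves on a toy frame.
# Rows that agree on every column are dropped first; afterwards rows that only
# share 'VorgangsID' and 'ObjektID' are collapsed to their first occurrence.
#   toy = pd.DataFrame(
#       {
#           'VorgangsID': [1, 1, 1],
#           'ObjektID': [10, 10, 10],
#           'VorgangsBeschreibung': ['a', 'a', 'b'],
#       }
#   )
#   (deduped,) = remove_duplicates(toy)  # keeps a single row ('a')
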
def remove_NA(
    data: DataFrame,
    target_features: Collection[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
    """drop NA entries based on a subset of features to be analysed

    Parameters
    ----------
    data : DataFrame
        standard IHM dataset, possibly pre-cleaned
    target_features : Collection[str], optional
        subset of features that defines an NA entry,
        by default ('VorgangsBeschreibung',)

    Returns
    -------
    tuple[DataFrame]
        dataset with NA entries removed for the given subset of features
    """
    target_features = list(target_features)
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
    )

    return (wo_NA,)

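# Example (sketch): the three preparation steps are typically chained, each step
# consuming the one-element tuple returned by the previous one; the path is
# hypothetical.
#   (raw,) = load_raw_data(Path('data/ihm_export.csv'))
#   (deduped,) = remove_duplicates(raw)
#   (cleaned,) = remove_NA(deduped, target_features=('VorgangsBeschreibung',))
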
# ** (2) entry-based cleansing
# ** moved to module ``lang_main.analysis.shared``


# ** in-depth analysis of one feature
# The following functions gain insights into a given feature of the IHM dataset,
# such as the number of occurrences or the associated object IDs.
def analyse_feature(
    data: DataFrame,
    target_feature: str,
) -> tuple[DataFrame]:
    # feature column
    feature_entries = data[target_feature]
    logger.info(
        'Number of entries for feature >>%s<<: %d', target_feature, len(feature_entries)
    )
    # obtain unique entries
    unique_feature_entries = feature_entries.unique()

    # prepare result DataFrame
    cols = ['batched_idxs', 'entry', 'len', 'num_occur', 'assoc_obj_ids', 'num_assoc_obj_ids']
    result_df = pd.DataFrame(columns=cols)

    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
        batched_idxs = temp.index.to_numpy()
        assoc_obj_ids = temp['ObjektID'].unique()
        assoc_obj_ids = np.sort(assoc_obj_ids, kind='stable')
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()

        conc_df = pd.DataFrame(
            data=[
                [batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]
            ],
            columns=cols,
        )

        result_df = pd.concat([result_df, conc_df], ignore_index=True)

    result_df = result_df.sort_values(by='num_occur', ascending=False).copy()

    return (result_df,)

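# Example (sketch): analysing the free-text feature 'VorgangsBeschreibung' of a
# pre-cleaned dataset (``cleaned`` from the sketch above). The resulting frame
# holds one row per unique entry with the columns 'batched_idxs', 'entry',
# 'len', 'num_occur', 'assoc_obj_ids' and 'num_assoc_obj_ids', sorted by
# 'num_occur' in descending order.
#   (feature_overview,) = analyse_feature(cleaned, target_feature='VorgangsBeschreibung')
#   print(feature_overview.head())
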
# ** pre-filter
def numeric_pre_filter_feature(
    data: DataFrame,
    feature: str,
    bound_lower: int | None,
    bound_upper: int | None,
) -> tuple[DataFrame]:
    # only None counts as 'no bound' so that 0 remains a valid bound
    if bound_lower is None and bound_upper is None:
        raise ValueError('No bounds for filtering provided')

    data = data.copy()
    if bound_lower is None:
        bound_lower = cast(int, data[feature].min())
    if bound_upper is None:
        bound_upper = cast(int, data[feature].max())

    filter_lower = data[feature] >= bound_lower
    filter_upper = data[feature] <= bound_upper
    filter_mask = filter_lower & filter_upper

    data = data.loc[filter_mask]

    return (data,)

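# Example (sketch): keep only entries that occur at least twice; passing None
# for a bound leaves that side open (the column minimum/maximum is used).
#   (frequent,) = numeric_pre_filter_feature(
#       feature_overview, feature='num_occur', bound_lower=2, bound_upper=None
#   )
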
# ** embedding based similarity
# The following functions identify similar entries to make the detection of
# duplicates more robust against disturbances such as typos, escape
# characters, etc.
def merge_similarity_dupl(
    data: DataFrame,
    model: SentenceTransformer,
    cos_sim_threshold: float,
) -> tuple[DataFrame]:
    logger.info('Start merging of similarity candidates...')

    # data
    merged_data = data.copy()
    model_input = merged_data['entry']
    candidates_idx = candidates_by_index(
        data_model_input=model_input,
        model=model,
        cos_sim_threshold=cos_sim_threshold,
    )
    # graph of similar ids
    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)

    for similar_id_group in similar_index_groups(similar_id_graph):
        similar_id_group = list(similar_id_group)
        similar_data = merged_data.loc[similar_id_group, :]
        # keep the first entry: max number of occurrences, then number of
        # associated objects, then length of entry
        similar_data = similar_data.sort_values(
            by=['num_occur', 'num_assoc_obj_ids', 'len'],
            ascending=[False, False, False],
        )
        # merge information into the first entry
        data_idx = cast(PandasIndex, similar_data.index[0])
        similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
        assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
        assoc_obj_ids = np.concatenate(assoc_obj_ids)
        assoc_obj_ids = np.unique(assoc_obj_ids)
        similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
        similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
        # remaining indices are to be removed
        similar_id_group.remove(data_idx)
        merged_similar_data = similar_data.drop(index=similar_id_group)
        # update entry in main dataset, drop remaining entries
        merged_data.update(merged_similar_data)
        merged_data = merged_data.drop(index=similar_id_group)

    logger.info('Similarity candidates merged successfully.')

    return (merged_data,)

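# Example (sketch): merging near-duplicate entries of the feature overview with
# a SentenceTransformer model; the model name and threshold are illustrative.
#   model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
#   (merged_overview,) = merge_similarity_dupl(
#       frequent, model=model, cos_sim_threshold=0.85
#   )
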
# ** #################################################################################
# TODO check removal
# build mapping of embeddings for given model
def build_embedding_map(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
    # dictionary with embeddings
    embeddings: dict[int, tuple[Embedding, str]] = {}
    is_spacy = False
    is_STRF = False

    if isinstance(model, GermanSpacyModel):
        is_spacy = True
    elif isinstance(model, SentenceTransformer):
        is_STRF = True

    if not any((is_spacy, is_STRF)):
        raise NotImplementedError('Model type unknown')

    for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
        # verbose code: Pyright not inferring types correctly
        idx = cast(int, idx)
        text = cast(str, text)
        if is_spacy:
            model = cast(GermanSpacyModel, model)
            embd = cast(SpacyDoc, model(text))
            embeddings[idx] = (embd, text)
            # check for empty vectors
            if not embd.vector_norm:
                logger.debug('--- Unknown Words ---')
                logger.debug('embd.text: %s has no vector', embd.text)
        elif is_STRF:
            model = cast(SentenceTransformer, model)
            embd = cast(Tensor, model.encode(text, show_progress_bar=False))
            embeddings[idx] = (embd, text)

    return embeddings, (is_spacy, is_STRF)

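# Example (sketch): the returned mapping associates each index of the input
# Series with a tuple of (embedding, original text); the flags signal which
# model family produced the embeddings.
#   embds, (is_spacy, is_strf) = build_embedding_map(
#       data=merged_overview['entry'], model=model
#   )
#   vector, text = embds[0]  # assumes index 0 exists in the input Series
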
# TODO adapt interface:
# use the candidates-by-index function, which merges
# build_embedding_map, build_cosSim_matrix and filt_thresh_cosSim_matrix


# build similarity matrix out of embeddings
def build_cosSim_matrix(
    data: Series,
    model: GermanSpacyModel | SentenceTransformer,
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
    # build empty matrix
    df_index = data.index
    cosineSim_idx_matrix = pd.DataFrame(
        data=0.0, columns=df_index, index=df_index, dtype=np.float32
    )

    logger.info('Start building embedding map...')

    # obtain embeddings based on the used model
    embds, (is_spacy, is_STRF) = build_embedding_map(
        data=data,
        model=model,
    )

    logger.info('Embedding map built successfully.')

    # apply index-based mapping for efficient handling of large texts
    combs = combinations(df_index, 2)
    total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)

    logger.info('Start calculation of similarity scores...')

    for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
        # print(f"{idx1=}, {idx2=}")
        embd1 = embds[idx1][0]
        embd2 = embds[idx2][0]

        # calculate similarity based on model type
        if is_spacy:
            embd1 = cast(SpacyDoc, embds[idx1][0])
            embd2 = cast(SpacyDoc, embds[idx2][0])
            cosSim = embd1.similarity(embd2)
        elif is_STRF:
            embd1 = cast(Tensor, embds[idx1][0])
            embd2 = cast(Tensor, embds[idx2][0])
            cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
            cosSim = cast(float, cosSim.item())

        cosineSim_idx_matrix.at[idx1, idx2] = cosSim

    logger.info('Similarity scores calculated successfully.')

    return cosineSim_idx_matrix, embds

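# Example (sketch): the loop covers all n * (n - 1) / 2 unordered index pairs,
# so runtime grows quadratically with the number of unique entries; only one
# cell per pair is filled, the matrix is not symmetrised.
#   cos_matrix, embds = build_cosSim_matrix(data=merged_overview['entry'], model=model)
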
# obtain index pairs with cosine similarity
# greater than or equal to given threshold value
def filt_thresh_cosSim_matrix(
    cosineSim_idx_matrix: DataFrame,
    embds: dict[int, tuple[Embedding, str]],
    threshold: float,
) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
    """filter similarity matrix by threshold value and return index pairs with
    a similarity score greater than or equal to the provided threshold

    Parameters
    ----------
    cosineSim_idx_matrix : DataFrame
        similarity matrix
    embds : dict[int, tuple[Embedding, str]]
        mapping of indices to embeddings and texts, passed through unchanged
    threshold : float
        similarity threshold

    Returns
    -------
    tuple[Series, dict[int, tuple[Embedding, str]]]
        series with multi index (index pairs) and corresponding similarity score,
        plus the unchanged embedding mapping
    """
    cosineSim_filt = cast(
        Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
    )

    return cosineSim_filt, embds

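# Example (sketch): ``where(...).stack()`` keeps only cells at or above the
# threshold and returns a Series whose MultiIndex holds the surviving
# (idx1, idx2) pairs; the threshold value is illustrative.
#   cos_sim_filt, embds = filt_thresh_cosSim_matrix(
#       cosineSim_idx_matrix=cos_matrix, embds=embds, threshold=0.85
#   )
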
def list_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
    save_candidates: bool = False,
    saving_path: Path | None = None,
    filename: str = 'CosSim-FilterCandidates',
    pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
    """provide an overview of candidates whose similarity score passed the
    threshold filter; the overview can optionally be saved as an Excel file,
    which is mainly useful for debugging purposes

    Returns
    -------
    list[tuple[PandasIndex, PandasIndex]]
        index pairs of entries whose similarity score passed the threshold filter
    dict[int, tuple[Embedding, str]]
        unchanged mapping of indices to embeddings and texts
    """
    logger.info('Start gathering of similarity candidates...')
    # compare found duplicates
    columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)

    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)):  # type: ignore
        # get text content from embedding map as second tuple entry
        content = [
            [
                idx1,
                embds[idx1][1],
                idx2,
                embds[idx2][1],
                score,
            ]
        ]
        # add candidates to collection DataFrame
        df_conc = pd.DataFrame(columns=columns, data=content)
        if df_candidates.empty:
            df_candidates = df_conc.copy()
        else:
            df_candidates = pd.concat([df_candidates, df_conc])
        # save index pairs
        index_pairs.append((idx1, idx2))

    logger.info('Similarity candidates gathered successfully.')

    if save_candidates:
        if saving_path is None:
            raise ValueError(
                'Saving path must be provided if duplicate candidates should be saved.'
            )
        elif pipeline is not None:
            target_filename = (
                f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
            )
        else:
            target_filename = f'{filename}.xlsx'
        logger.info('Saving similarity candidates...')
        target_path = saving_path.joinpath(target_filename)
        df_candidates.to_excel(target_path)
        logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)

    return index_pairs, embds

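# Example (sketch): listing the candidate pairs and optionally exporting them to
# an Excel overview; the saving path is hypothetical.
#   index_pairs, embds = list_cosSim_dupl_candidates(
#       cosineSim_filt=cos_sim_filt,
#       embds=embds,
#       save_candidates=True,
#       saving_path=Path('results'),
#   )
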
# TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py``
"""
def similar_ids_connection_graph(
    similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
    # build index graph to obtain graph of connected (similar) indices
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    for (idx1, idx2) in similar_idx_pairs:
        # inplace operation, parent/child do not really exist in undirected graph
        update_graph(graph=similar_id_graph, parent=idx1, child=idx2)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)

    return similar_id_graph, graph_info


def similar_ids_groups(
    dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]],
                      nx.connected_components(G=dupl_id_graph))

    for id_group in ids_groups:
        yield list(id_group)
"""

# merge duplicates
def merge_similarity_dupl_old(
    data: DataFrame,
    dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]:
    # copy pre-cleaned data
    temp = data.copy()
    index = temp.index
    # logger.info("Start merging of similarity candidates...")

    # iterate over index pairs
    for i1, i2 in tqdm(dupl_idx_pairs):
        # if an entry does not exist any more, skip this pair
        if i1 not in index or i2 not in index:
            continue

        # merge num occur
        num_occur1 = temp.at[i1, 'num_occur']
        num_occur2 = temp.at[i2, 'num_occur']
        new_num_occur = num_occur1 + num_occur2

        # merge associated object ids
        assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
        assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
        new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
        new_assoc_ids = np.unique(new_assoc_ids.flatten())

        # recalculate num associated obj ids
        new_num_assoc_obj_ids = len(new_assoc_ids)

        # write properties to first entry
        temp.at[i1, 'num_occur'] = new_num_occur
        temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
        temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids

        # drop second entry
        temp = temp.drop(index=i2)
        index = temp.index

    # logger.info("Similarity candidates merged successfully.")

    return (temp,)

# ** debugging and evaluation
def choose_cosSim_dupl_candidates(
    cosineSim_filt: Series,
    embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
    """provide an overview of candidates whose similarity score passed the
    threshold filter; the decision whether a pair is a duplicate is made
    manually by iterating through the candidates with user interaction;
    mainly useful for debugging purposes

    Returns
    -------
    DataFrame
        contains indices, corresponding texts and similarity score to evaluate results
    list[tuple[PandasIndex, PandasIndex]]
        index pairs of entries confirmed as duplicates by the user
    """
    # compare found duplicates
    columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
    df_candidates = pd.DataFrame(columns=columns)

    index_pairs: list[tuple[PandasIndex, PandasIndex]] = []

    for (idx1, idx2), score in cosineSim_filt.items():  # type: ignore
        # get texts for comparison
        text1 = embds[idx1][1]
        text2 = embds[idx2][1]
        # get decision
        print('---------- New Decision ----------')
        print('text1:\n', text1, '\n', flush=True)
        print('text2:\n', text2, '\n', flush=True)
        decision = input('Please enter >>y<< if this is a duplicate, else hit enter: ')

        if decision != 'y':
            continue

        # get text content from embedding as second tuple entry
        content = [
            [
                idx1,
                text1,
                idx2,
                text2,
                score,
            ]
        ]
        df_conc = pd.DataFrame(columns=columns, data=content)

        df_candidates = pd.concat([df_candidates, df_conc])
        index_pairs.append((idx1, idx2))

    return df_candidates, index_pairs