Module lang_main.analysis.shared

Functions

def candidates_by_index(data_model_input: pandas.core.series.Series,
model: sentence_transformers.SentenceTransformer.SentenceTransformer,
cos_sim_threshold: float = 0.5) -> Iterator[tuple[int | numpy.int64, int | numpy.int64]]
Expand source code
def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """Filter candidate index pairs by cosine similarity of their text entries,
    encoded in batch mode with a SentenceTransformer model.

    Data is fed as a Series to retain the indices of the entries, so
    matching entries can be accessed later in the original dataset.

    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        SentenceTransformer model used to encode the text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        tuple of index pairs which meet the cosine similarity threshold
    """
    # encode all entries in one batch; keep the result as a tensor so it can
    # be fed straight into model.similarity below
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # full pairwise cosine-similarity matrix (symmetric)
    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
    # keep only the strict upper triangle: this drops self-matches (diagonal)
    # and mirrored duplicates (lower triangle) in a single step
    cos_sim = np.triu(cos_sim, k=1)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    # map positional indices back to the Series' original index labels
    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair

function to filter candidate indices based on cosine similarity using SentenceTransformer model in batch mode, feed data as Series to retain information about indices of entries and access them later in the original dataset

Parameters

data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5

Yields

Iterator[tuple[PandasIndex, PandasIndex]]
tuple of index pairs which meet the cosine similarity threshold
def clean_string_slim(string: str) -> str
Expand source code
def clean_string_slim(string: str) -> str:
    """Clean a single string entry of the dataset.

    Mapping function intended to be applied element-wise to string
    features (feature-wise over a series of the dataset).

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # strip escape sequences, then collapse repeated characters;
    # dates are intentionally NOT removed at this stage — they carry context
    without_escapes = pattern_escape_seq.sub(' ', string)
    without_repeats = pattern_repeated_chars.sub('', without_escapes)
    # normalise whitespace runs to a single space
    normalised = pattern_whitespace.sub(' ', without_repeats)
    # drop leading and trailing whitespace
    return normalised.strip()

mapping function to clean single string entries in a series (feature-wise) of the dataset, used to be applied element-wise for string features

Parameters

string : str
dataset entry feature

Returns

str
cleaned entry
def entry_wise_cleansing(data: pandas.core.frame.DataFrame,
target_features: Collection[str],
cleansing_func: Callable[[str], str] = clean_string_slim) -> tuple[pandas.core.frame.DataFrame]
Expand source code
def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
    """Apply an element-wise cleansing function to selected string features.

    Parameters
    ----------
    data : DataFrame
        dataset whose target columns are cleansed (columns are reassigned
        on the passed object and the same object is returned)
    target_features : Collection[str]
        names of the (string) columns to cleanse
    cleansing_func : Callable[[str], str], optional
        function applied to every entry of the target columns,
        by default ``clean_string_slim``

    Returns
    -------
    tuple[DataFrame]
        one-element tuple containing the cleansed DataFrame
        (presumably to keep a uniform pipeline-step interface — confirm
        with callers)
    """
    # materialise to a list so column selection works for any Collection
    target_features = list(target_features)
    # apply given cleansing function element-wise to the target features
    data[target_features] = data[target_features].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
        cleansing_func.__name__,
        target_features,
    )
    return (data,)
def similar_index_connection_graph(similar_idx_pairs: Iterable[tuple[int | numpy.int64, int | numpy.int64]]) -> tuple[networkx.classes.graph.Graph, dict[str, float]]
Expand source code
def similar_index_connection_graph(
    similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, float]]:
    """Build an undirected graph connecting similar dataset indices.

    The connected components of the resulting graph are the groups of
    indices which belong together, retaining the semantic connection on
    the whole dataset.

    Parameters
    ----------
    similar_idx_pairs : Iterable[tuple[PandasIndex, PandasIndex]]
        index pairs classified as similar (e.g. yielded by
        ``candidates_by_index``)

    Returns
    -------
    tuple[Graph, dict[str, float]]
        the connection graph and its metadata
    """
    similar_id_graph = nx.Graph()
    # batch insertion of all pairs; the graph is undirected, so there is no
    # real parent/child distinction between the two indices of a pair
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info
def similar_index_groups(similar_id_graph: networkx.classes.graph.Graph) -> Iterator[tuple[int | numpy.int64, ...]]
Expand source code
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    """Yield every group of connected (similar) indices as a tuple."""
    # each connected component is one group of indices belonging together
    component_sets = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
    yield from (tuple(component) for component in component_sets)