Module lang_main.analysis.shared
Functions
-
Expand source code
def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
    """Filter candidate index pairs by cosine similarity.

    Encodes all text entries in one batch with the given
    SentenceTransformer model, computes the pairwise cosine similarity
    matrix and yields every pair of pandas indices whose similarity
    meets the threshold. Feeding the data as a Series retains the
    original dataset indices so entries can be accessed later in the
    original dataset.

    Parameters
    ----------
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        tuple of index pairs which meet the cosine similarity threshold
    """
    # encode all entries in a single batch
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # pairwise cosine similarity: zero the diagonal (self-similarity) and
    # keep only the upper triangle so each pair is reported exactly once
    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        # map positional matrix indices back to the Series' original indices
        idx_pair = cast(
            tuple[np.int64, np.int64],
            tuple(data_model_input.index[idx] for idx in idx_array),
        )
        yield idx_pair
Parameters
obj_id:ObjectID- (stale entry: the function signature has no `obj_id` parameter; this placeholder should be removed from the docstring)
data_model_input:Series- containing indices and text entries to process
model:SentenceTransformer- necessary SentenceTransformer model to encode text entries
cos_sim_threshold:float, optional- threshold for cosine similarity to filter candidates, by default 0.5
Yields
Iterator[tuple[PandasIndex, PandasIndex]]- tuple of index pairs which meet the cosine similarity threshold
-
Expand source code
def clean_string_slim(string: str) -> str:
    """Clean a single string entry of a dataset feature.

    Intended to be applied element-wise to string features of a
    series: substitutes escape sequences, removes repeated characters,
    collapses whitespace runs and trims the result.

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # substitute escape sequences with a space, drop repeated characters
    # NOTE: dates are deliberately NOT removed here — they provide context
    # and should not be stripped at this stage
    cleaned = pattern_escape_seq.sub(' ', string)
    cleaned = pattern_repeated_chars.sub('', cleaned)
    # collapse whitespace runs, then trim leading/trailing whitespace
    cleaned = pattern_whitespace.sub(' ', cleaned)
    return cleaned.strip()
Parameters
string:str- dataset entry feature
Returns
str- cleaned entry
-
Expand source code
def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
    """Apply a cleansing function element-wise to the given features.

    NOTE: mutates ``data`` in place and returns it wrapped in a
    one-element tuple — presumably a pipeline-step convention; verify
    against callers.

    Parameters
    ----------
    data : DataFrame
        dataset whose feature columns are cleansed in place
    target_features : Collection[str]
        names of the columns to apply the cleansing function to
    cleansing_func : Callable[[str], str], optional
        element-wise cleansing function, by default ``clean_string_slim``

    Returns
    -------
    tuple[DataFrame]
        the (mutated) dataset wrapped in a one-element tuple
    """
    features = list(target_features)
    # element-wise application of the cleansing function to selected columns
    data[features] = data[features].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
        cleansing_func.__name__,
        features,
    )
    return (data,)
Expand source code
def similar_index_connection_graph(
    similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, float]]:
    """Build an undirected graph connecting similar dataset indices.

    Connected components of the resulting graph are the groups of
    indices that belong together, retaining the semantic connections
    across the whole dataset.

    Parameters
    ----------
    similar_idx_pairs : Iterable[tuple[PandasIndex, PandasIndex]]
        index pairs that were found to be similar

    Returns
    -------
    tuple[Graph, dict[str, float]]
        the connection graph and its metadata
    """
    # all pairs are added as edges in one batch; parent/child roles carry
    # no meaning in an undirected graph
    index_graph = nx.Graph()
    update_graph(graph=index_graph, batch=similar_idx_pairs)
    metadata = get_graph_metadata(graph=index_graph, logging=False)
    return index_graph, metadata
Expand source code
def similar_index_groups(
    similar_id_graph: Graph,
) -> Iterator[tuple[PandasIndex, ...]]:
    """Yield each connected component of the graph as a tuple of indices.

    Parameters
    ----------
    similar_id_graph : Graph
        graph whose connected components represent groups of similar indices

    Yields
    ------
    Iterator[tuple[PandasIndex, ...]]
        one tuple of connected (similar) indices per component
    """
    components = cast(
        Iterator[set[PandasIndex]],
        nx.connected_components(G=similar_id_graph),
    )
    yield from (tuple(component) for component in components)