lang-main/docs/lang_main/analysis/shared.html

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.shared API documentation</title>
<meta name="description" content="API documentation for the lang_main.analysis.shared module.">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
/* Limit highlight.js language auto-detection to the languages expected in
   these docs, then highlight the code blocks on the page. */
hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
hljs.highlightAll();
/* Collapse long source docstrings into a <details> element so the listing
   stays compact. The short delay is presumably there to make sure the
   highlighted markup exists before it is queried. */
setTimeout(() => {
  const DOCSTRING_DELIMITERS = ['"""', "'''"];
  // querySelectorAll returns a static NodeList, so replacing nodes while
  // iterating is safe.
  for (const el of document.querySelectorAll('.hljs.language-python > .hljs-string')) {
    const html = el.innerHTML;
    const delimiter = html.substring(0, 3);
    // Only triple-quoted strings longer than 200 characters are collapsed.
    if (html.length <= 200 || !DOCSTRING_DELIMITERS.includes(delimiter)) {
      continue;
    }
    const details = document.createElement('details');
    details.classList.add('hljs-string');
    // Show the docstring's own opening delimiter in the summary; previously
    // '"""' was hard-coded, which mislabelled docstrings opened with "'''".
    details.innerHTML = '<summary>' + delimiter + '</summary>' + html.substring(3);
    el.replaceWith(details);
  }
}, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.shared</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.shared.candidates_by_index"><code class="name flex">
<span>def <span class="ident">candidates_by_index</span></span>(<span>data_model_input: pandas.core.series.Series,<br>model: sentence_transformers.SentenceTransformer.SentenceTransformer,<br>cos_sim_threshold: float = 0.5) ‑> Iterator[tuple[int | numpy.int64, int | numpy.int64]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
) -&gt; Iterator[tuple[PandasIndex, PandasIndex]]:
    &#34;&#34;&#34;function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset

    Parameters
    ----------
    obj_id : ObjectID
        _description_
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5

    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        tuple of index pairs which meet the cosine similarity threshold
    &#34;&#34;&#34;
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim &gt;= cos_sim_threshold)

    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair</code></pre>
</details>
<div class="desc"><p>Filters candidate index pairs by cosine similarity,
using a SentenceTransformer model in batch mode.
Pass the data as a Series so that the indices of the entries are retained and
can be used later to access the entries in the original dataset.</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data_model_input</code></strong> :&ensp;<code>Series</code></dt>
<dd>containing indices and text entries to process</dd>
<dt><strong><code>model</code></strong> :&ensp;<code>SentenceTransformer</code></dt>
<dd>necessary SentenceTransformer model to encode text entries</dd>
<dt><strong><code>cos_sim_threshold</code></strong> :&ensp;<code>float</code>, optional</dt>
<dd>threshold for cosine similarity to filter candidates, by default 0.5</dd>
</dl>
<h2 id="yields">Yields</h2>
<dl>
<dt><code>Iterator[tuple[PandasIndex, PandasIndex]]</code></dt>
<dd>pair of indices whose entries reach or exceed the cosine similarity threshold</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.clean_string_slim"><code class="name flex">
<span>def <span class="ident">clean_string_slim</span></span>(<span>string: str) ‑> str</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def clean_string_slim(string: str) -&gt; str:
    &#34;&#34;&#34;mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    &#34;&#34;&#34;
    # remove special chars
    # string = pattern_escape_newline.sub(&#39; &#39;, string)
    string = pattern_escape_seq.sub(&#39; &#39;, string)
    string = pattern_repeated_chars.sub(&#39;&#39;, string)
    # string = pattern_dates.sub(&#39;&#39;, string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(&#39; &#39;, string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string</code></pre>
</details>
<div class="desc"><p>Mapping function that cleans a single string entry of a dataset series (feature);
intended to be applied element-wise to string features.</p>
<h2 id="parameters_1">Parameters</h2>
<dl>
<dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
<dd>dataset entry feature</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>str</code></dt>
<dd>cleaned entry</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.entry_wise_cleansing"><code class="name flex">
<span>def <span class="ident">entry_wise_cleansing</span></span>(<span>data: pandas.core.frame.DataFrame,<br>target_features: Collection[str],<br>cleansing_func: Callable[[str], str] = &lt;function clean_string_slim&gt;) ‑> tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -&gt; tuple[DataFrame]:
    # apply given cleansing function to target feature
    target_features = list(target_features)
    data[target_features] = data[target_features].map(cleansing_func)
    logger.info(
        (&#39;Successfully applied entry-wise cleansing procedure &gt;&gt;%s&lt;&lt; for features &gt;&gt;%s&lt;&lt;&#39;),
        cleansing_func.__name__,
        target_features,
    )
    return (data,)</code></pre>
</details>
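<div class="desc"><p>Applies <code>cleansing_func</code> element-wise to the <code>target_features</code> columns of <code>data</code>, writes the cleaned values back into <code>data</code>, logs the operation, and returns <code>data</code> as a one-element tuple.</p></div>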
</dd>
<dt id="lang_main.analysis.shared.similar_index_connection_graph"><code class="name flex">
<span>def <span class="ident">similar_index_connection_graph</span></span>(<span>similar_idx_pairs: Iterable[tuple[int | numpy.int64, int | numpy.int64]]) ‑> tuple[networkx.classes.graph.Graph, dict[str, float]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_connection_graph(
    similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -&gt; tuple[Graph, dict[str, float]]:
    # build index graph to obtain graph of connected (similar) indices
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    # for idx1, idx2 in similar_idx_pairs:
    #     # inplace operation, parent/child do not really exist in undirected graph
    #     update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)

    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)

    return similar_id_graph, graph_info</code></pre>
</details>
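<div class="desc"><p>Builds an undirected graph from the given pairs of similar indices and returns it together with the metadata dictionary obtained from <code>get_graph_metadata</code>.</p></div>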
</dd>
<dt id="lang_main.analysis.shared.similar_index_groups"><code class="name flex">
<span>def <span class="ident">similar_index_groups</span></span>(<span>similar_id_graph: networkx.classes.graph.Graph) ‑> Iterator[tuple[int | numpy.int64, ...]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_groups(
    similar_id_graph: Graph,
) -&gt; Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))

    for id_group in ids_groups:
        yield tuple(id_group)</code></pre>
</details>
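<div class="desc"><p>Yields the connected components of <code>similar_id_graph</code>, each as a tuple of the indices that belong together.</p></div>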
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.shared.candidates_by_index" href="#lang_main.analysis.shared.candidates_by_index">candidates_by_index</a></code></li>
<li><code><a title="lang_main.analysis.shared.clean_string_slim" href="#lang_main.analysis.shared.clean_string_slim">clean_string_slim</a></code></li>
<li><code><a title="lang_main.analysis.shared.entry_wise_cleansing" href="#lang_main.analysis.shared.entry_wise_cleansing">entry_wise_cleansing</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_connection_graph" href="#lang_main.analysis.shared.similar_index_connection_graph">similar_index_connection_graph</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_groups" href="#lang_main.analysis.shared.similar_index_groups">similar_index_groups</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>