2025-01-22 16:54:15 +01:00

274 lines
15 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.shared API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
  // highlight.js comes from a CDN <script> guarded by an integrity hash; when that
  // script is blocked or fails to load, `hljs` is never defined. Bail out quietly so
  // the page still renders (unhighlighted) instead of throwing a ReferenceError.
  if (typeof hljs === 'undefined') return;

  hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
  hljs.highlightAll();

  /* Collapse source docstrings */
  // Runs shortly after highlightAll(); it relies on the `.hljs-string` spans that
  // highlighting produces inside Python code blocks.
  setTimeout(() => {
    const DOCSTRING_DELIMITERS = ['"""', "'''"];
    [...document.querySelectorAll('.hljs.language-python > .hljs-string')]
      // Only long, triple-quoted strings are treated as docstrings worth collapsing.
      .filter(el => el.innerHTML.length > 200 && DOCSTRING_DELIMITERS.includes(el.innerHTML.substring(0, 3)))
      .forEach(el => {
        const markup = el.innerHTML;
        const delimiter = markup.substring(0, 3);
        const d = document.createElement('details');
        d.classList.add('hljs-string');
        // Use the docstring's own opening delimiter as the summary so that a
        // '''-delimited docstring is not shown with a mismatched """ opener.
        d.innerHTML = '<summary>' + delimiter + '</summary>' + markup.substring(3);
        el.replaceWith(d);
      });
  }, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.shared</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.shared.candidates_by_index"><code class="name flex">
<span>def <span class="ident">candidates_by_index</span></span>(<span>data_model_input: pandas.core.series.Series,<br>model: sentence_transformers.SentenceTransformer.SentenceTransformer,<br>cos_sim_threshold: float = 0.5) -&gt; Iterator[tuple[int | numpy.int64, int | numpy.int64]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,
    cos_sim_threshold: float = 0.5,
) -&gt; Iterator[tuple[PandasIndex, PandasIndex]]:
    &#34;&#34;&#34;function to filter candidate indices based on cosine similarity
    using SentenceTransformer model in batch mode,
    feed data as Series to retain information about indices of entries and
    access them later in the original dataset
    Parameters
    ----------
    obj_id : ObjectID
        _description_
    data_model_input : Series
        containing indices and text entries to process
    model : SentenceTransformer
        necessary SentenceTransformer model to encode text entries
    cos_sim_threshold : float, optional
        threshold for cosine similarity to filter candidates, by default 0.5
    Yields
    ------
    Iterator[tuple[PandasIndex, PandasIndex]]
        tuple of index pairs which meet the cosine similarity threshold
    &#34;&#34;&#34;
    # embeddings
    batch = cast(list[str], data_model_input.to_list())
    embds = cast(
        Tensor,
        model.encode(
            batch,
            convert_to_numpy=False,
            convert_to_tensor=True,
            show_progress_bar=False,
        ),
    )
    # cosine similarity
    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
    cos_sim = np.triu(cos_sim)
    cos_sim_idx = np.argwhere(cos_sim &gt;= cos_sim_threshold)
    for idx_array in cos_sim_idx:
        idx_pair = cast(
            tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
        )
        yield idx_pair</code></pre>
</details>
<div class="desc"><p>function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>obj_id</code></strong> :&ensp;<code>ObjectID</code></dt>
<dd><em>description</em></dd>
<dt><strong><code>data_model_input</code></strong> :&ensp;<code>Series</code></dt>
<dd>containing indices and text entries to process</dd>
<dt><strong><code>model</code></strong> :&ensp;<code>SentenceTransformer</code></dt>
<dd>necessary SentenceTransformer model to encode text entries</dd>
<dt><strong><code>cos_sim_threshold</code></strong> :&ensp;<code>float</code>, optional</dt>
<dd>threshold for cosine similarity to filter candidates, by default 0.5</dd>
</dl>
<h2 id="yields">Yields</h2>
<dl>
<dt><code>Iterator[tuple[PandasIndex, PandasIndex]]</code></dt>
<dd>tuple of index pairs which meet the cosine similarity threshold</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.clean_string_slim"><code class="name flex">
<span>def <span class="ident">clean_string_slim</span></span>(<span>string: str) -&gt; str</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def clean_string_slim(string: str) -&gt; str:
    &#34;&#34;&#34;mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features
    Parameters
    ----------
    string : str
        dataset entry feature
    Returns
    -------
    str
        cleaned entry
    &#34;&#34;&#34;
    # remove special chars
    # string = pattern_escape_newline.sub(&#39; &#39;, string)
    string = pattern_escape_seq.sub(&#39; &#39;, string)
    string = pattern_repeated_chars.sub(&#39;&#39;, string)
    # string = pattern_dates.sub(&#39;&#39;, string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(&#39; &#39;, string)
    # remove whitespaces at the beginning and the end
    string = string.strip()
    return string</code></pre>
</details>
<div class="desc"><p>mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features</p>
<h2 id="parameters_1">Parameters</h2>
<dl>
<dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
<dd>dataset entry feature</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>str</code></dt>
<dd>cleaned entry</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.entry_wise_cleansing"><code class="name flex">
<span>def <span class="ident">entry_wise_cleansing</span></span>(<span>data: pandas.core.frame.DataFrame,<br>target_features: Collection[str],<br>cleansing_func: Callable[[str], str] = &lt;function clean_string_slim&gt;) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -&gt; tuple[DataFrame]:
    # apply given cleansing function to target feature
    target_features = list(target_features)
    data[target_features] = data[target_features].map(cleansing_func)
    logger.info(
        (&#39;Successfully applied entry-wise cleansing procedure &gt;&gt;%s&lt;&lt; for features &gt;&gt;%s&lt;&lt;&#39;),
        cleansing_func.__name__,
        target_features,
    )
    return (data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.shared.similar_index_connection_graph"><code class="name flex">
<span>def <span class="ident">similar_index_connection_graph</span></span>(<span>similar_idx_pairs: Iterable[tuple[int | numpy.int64, int | numpy.int64]]) -&gt; tuple[networkx.classes.graph.Graph, dict[str, float]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_connection_graph(
    similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -&gt; tuple[Graph, dict[str, float]]:
    # build index graph to obtain graph of connected (similar) indices
    # use this graph to get connected components (indices which belong together)
    # retain semantic connection on whole dataset
    similar_id_graph = nx.Graph()
    # for idx1, idx2 in similar_idx_pairs:
    # # inplace operation, parent/child do not really exist in undirected graph
    # update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
    update_graph(graph=similar_id_graph, batch=similar_idx_pairs)
    graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
    return similar_id_graph, graph_info</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.shared.similar_index_groups"><code class="name flex">
<span>def <span class="ident">similar_index_groups</span></span>(<span>similar_id_graph: networkx.classes.graph.Graph) -&gt; Iterator[tuple[int | numpy.int64, ...]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_groups(
    similar_id_graph: Graph,
) -&gt; Iterator[tuple[PandasIndex, ...]]:
    # groups of connected indices
    ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
    for id_group in ids_groups:
        yield tuple(id_group)</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.shared.candidates_by_index" href="#lang_main.analysis.shared.candidates_by_index">candidates_by_index</a></code></li>
<li><code><a title="lang_main.analysis.shared.clean_string_slim" href="#lang_main.analysis.shared.clean_string_slim">clean_string_slim</a></code></li>
<li><code><a title="lang_main.analysis.shared.entry_wise_cleansing" href="#lang_main.analysis.shared.entry_wise_cleansing">entry_wise_cleansing</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_connection_graph" href="#lang_main.analysis.shared.similar_index_connection_graph">similar_index_connection_graph</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_groups" href="#lang_main.analysis.shared.similar_index_groups">similar_index_groups</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>