added test cases

This commit is contained in:
Florian Förster
2025-01-22 16:54:15 +01:00
parent 30fe71e80a
commit fb28b8548b
28 changed files with 17721 additions and 17 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,98 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
hljs.highlightAll();
/* Collapse source docstrings */
setTimeout(() => {
[...document.querySelectorAll('.hljs.language-python > .hljs-string')]
.filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
.forEach(el => {
let d = document.createElement('details');
d.classList.add('hljs-string');
d.innerHTML = '<summary>"""</summary>' + el.innerHTML.substring(3);
el.replaceWith(d);
});
}, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis</code></h1>
</header>
<section id="section-intro">
</section>
<section>
<h2 class="section-title" id="header-submodules">Sub-modules</h2>
<dl>
<dt><code class="name"><a title="lang_main.analysis.graphs" href="graphs.html">lang_main.analysis.graphs</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="lang_main.analysis.preprocessing" href="preprocessing.html">lang_main.analysis.preprocessing</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="lang_main.analysis.shared" href="shared.html">lang_main.analysis.shared</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="lang_main.analysis.timeline" href="timeline.html">lang_main.analysis.timeline</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
<dt><code class="name"><a title="lang_main.analysis.tokens" href="tokens.html">lang_main.analysis.tokens</a></code></dt>
<dd>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
<section>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main" href="../index.html">lang_main</a></code></li>
</ul>
</li>
<li><h3><a href="#header-submodules">Sub-modules</a></h3>
<ul>
<li><code><a title="lang_main.analysis.graphs" href="graphs.html">lang_main.analysis.graphs</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing" href="preprocessing.html">lang_main.analysis.preprocessing</a></code></li>
<li><code><a title="lang_main.analysis.shared" href="shared.html">lang_main.analysis.shared</a></code></li>
<li><code><a title="lang_main.analysis.timeline" href="timeline.html">lang_main.analysis.timeline</a></code></li>
<li><code><a title="lang_main.analysis.tokens" href="tokens.html">lang_main.analysis.tokens</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>

View File

@@ -0,0 +1,451 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.preprocessing API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
hljs.highlightAll();
/* Collapse source docstrings */
setTimeout(() => {
[...document.querySelectorAll('.hljs.language-python > .hljs-string')]
.filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
.forEach(el => {
let d = document.createElement('details');
d.classList.add('hljs-string');
d.innerHTML = '<summary>"""</summary>' + el.innerHTML.substring(3);
el.replaceWith(d);
});
}, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.preprocessing</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.preprocessing.analyse_feature"><code class="name flex">
<span>def <span class="ident">analyse_feature</span></span>(<span>data: DataFrame, target_feature: str) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def analyse_feature(
data: DataFrame,
target_feature: str,
) -&gt; tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(
&#39;Number of entries for feature &gt;&gt;%s&lt;&lt;: %d&#39;, target_feature, len(feature_entries)
)
# obtain unique entries
unique_feature_entries = feature_entries.unique()
# prepare result DataFrame
cols = [&#39;batched_idxs&#39;, &#39;entry&#39;, &#39;len&#39;, &#39;num_occur&#39;, &#39;assoc_obj_ids&#39;, &#39;num_assoc_obj_ids&#39;]
result_df = pd.DataFrame(columns=cols)
for entry in tqdm(unique_feature_entries, mininterval=1.0):
len_entry = len(entry)
filt = data[target_feature] == entry
temp = data[filt]
batched_idxs = temp.index.to_numpy()
assoc_obj_ids = temp[&#39;ObjektID&#39;].unique()
assoc_obj_ids = np.sort(assoc_obj_ids, kind=&#39;stable&#39;)
num_assoc_obj_ids = len(assoc_obj_ids)
num_dupl = filt.sum()
conc_df = pd.DataFrame(
data=[
[batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids]
],
columns=cols,
)
result_df = pd.concat([result_df, conc_df], ignore_index=True)
result_df = result_df.sort_values(
by=[&#39;num_occur&#39;, &#39;len&#39;], ascending=[False, False]
).copy()
return (result_df,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.preprocessing.load_raw_data"><code class="name flex">
<span>def <span class="ident">load_raw_data</span></span>(<span>path: Path,<br>date_cols: Collection[str] = ('VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn', 'ErstellungsDatum')) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def load_raw_data(
path: Path,
date_cols: Collection[str] = (
&#39;VorgangsDatum&#39;,
&#39;ErledigungsDatum&#39;,
&#39;Arbeitsbeginn&#39;,
&#39;ErstellungsDatum&#39;,
),
) -&gt; tuple[DataFrame]:
&#34;&#34;&#34;load IHM dataset with standard structure
Parameters
----------
path : str
path to dataset file, usually CSV file
date_cols : Collection[str], optional
columns which contain dates and are parsed as such,
by default (
&#39;VorgangsDatum&#39;,
&#39;ErledigungsDatum&#39;,
&#39;Arbeitsbeginn&#39;,
&#39;ErstellungsDatum&#39;,
)
Returns
-------
DataFrame
raw dataset as DataFrame
&#34;&#34;&#34;
# load dataset
date_cols = list(date_cols)
data = pd.read_csv(
filepath_or_buffer=path,
sep=&#39;;&#39;,
encoding=&#39;cp1252&#39;,
parse_dates=list(date_cols),
dayfirst=True,
)
logger.info(&#39;Loaded dataset successfully.&#39;)
logger.info(
(
f&#39;Dataset properties: number of entries: {len(data)}, &#39;
f&#39;number of features {len(data.columns)}&#39;
)
)
return (data,)</code></pre>
</details>
<div class="desc"><p>load IHM dataset with standard structure</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>path</code></strong> :&ensp;<code>str</code></dt>
<dd>path to dataset file, usually CSV file</dd>
<dt><strong><code>date_cols</code></strong> :&ensp;<code>Collection[str]</code>, optional</dt>
<dd>columns which contain dates and are parsed as such,
by default (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
'ErstellungsDatum',
)</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>DataFrame</code></dt>
<dd>raw dataset as DataFrame</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.preprocessing.merge_similarity_duplicates"><code class="name flex">
<span>def <span class="ident">merge_similarity_duplicates</span></span>(<span>data: DataFrame, model: SentenceTransformer, cos_sim_threshold: float) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def merge_similarity_duplicates(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -&gt; tuple[DataFrame]:
logger.info(&#39;Start merging of similarity candidates...&#39;)
# data
merged_data = data.copy()
model_input = merged_data[&#39;entry&#39;]
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=[&#39;num_occur&#39;, &#39;num_assoc_obj_ids&#39;, &#39;len&#39;],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, &#39;num_occur&#39;] = similar_data[&#39;num_occur&#39;].sum()
assoc_obj_ids = similar_data[&#39;assoc_obj_ids&#39;].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, &#39;assoc_obj_ids&#39;] = assoc_obj_ids
similar_data.at[data_idx, &#39;num_assoc_obj_ids&#39;] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info(&#39;Similarity candidates merged successfully.&#39;)
return (merged_data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.preprocessing.numeric_pre_filter_feature"><code class="name flex">
<span>def <span class="ident">numeric_pre_filter_feature</span></span>(<span>data: DataFrame, feature: str, bound_lower: int | None, bound_upper: int | None) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def numeric_pre_filter_feature(
data: DataFrame,
feature: str,
bound_lower: int | None,
bound_upper: int | None,
) -&gt; tuple[DataFrame]:
&#34;&#34;&#34;filter DataFrame for a given numerical feature regarding their bounds
bounds are inclusive: entries (bound_lower &lt;= entry &lt;= bound_upper) are retained
Parameters
----------
data : DataFrame
DataFrame to filter
feature : str
feature name to filter
bound_lower : int | None
lower bound of values to retain
bound_upper : int | None
upper bound of values to retain
Returns
-------
tuple[DataFrame]
filtered DataFrame
Raises
------
ValueError
if no bounds are provided, at least one bound must be set
&#34;&#34;&#34;
if not any([bound_lower, bound_upper]):
raise ValueError(&#39;No bounds for filtering provided&#39;)
data = data.copy()
if bound_lower is None:
bound_lower = cast(int, data[feature].min())
if bound_upper is None:
bound_upper = cast(int, data[feature].max())
filter_lower = data[feature] &gt;= bound_lower
filter_upper = data[feature] &lt;= bound_upper
filter = filter_lower &amp; filter_upper
data = data.loc[filter]
return (data,)</code></pre>
</details>
<div class="desc"><p>filter DataFrame for a given numerical feature regarding their bounds
bounds are inclusive: entries (bound_lower &lt;= entry &lt;= bound_upper) are retained</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>DataFrame to filter</dd>
<dt><strong><code>feature</code></strong> :&ensp;<code>str</code></dt>
<dd>feature name to filter</dd>
<dt><strong><code>bound_lower</code></strong> :&ensp;<code>int | None</code></dt>
<dd>lower bound of values to retain</dd>
<dt><strong><code>bound_upper</code></strong> :&ensp;<code>int | None</code></dt>
<dd>upper bound of values to retain</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>tuple[DataFrame]</code></dt>
<dd>filtered DataFrame</dd>
</dl>
<h2 id="raises">Raises</h2>
<dl>
<dt><code>ValueError</code></dt>
<dd>if no bounds are provided, at least one bound must be set</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.preprocessing.remove_NA"><code class="name flex">
<span>def <span class="ident">remove_NA</span></span>(<span>data: DataFrame, target_features: Collection[str] = ('VorgangsBeschreibung',)) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def remove_NA(
data: DataFrame,
target_features: Collection[str] = (&#39;VorgangsBeschreibung&#39;,),
) -&gt; tuple[DataFrame]:
&#34;&#34;&#34;function to drop NA entries based on a subset of features to be analysed
Parameters
----------
data : DataFrame
standard IHM dataset, perhaps pre-cleaned
target_features : Collection[str], optional
subset to analyse to define an NA entry, by default (&#39;VorgangsBeschreibung&#39;,)
Returns
-------
DataFrame
dataset with removed NA entries for given subset of features
&#34;&#34;&#34;
target_features = list(target_features)
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info(
f&#39;Removed NA entries for features &gt;&gt;{target_features}&lt;&lt; from dataset successfully.&#39;
)
return (wo_NA,)</code></pre>
</details>
<div class="desc"><p>function to drop NA entries based on a subset of features to be analysed</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>standard IHM dataset, perhaps pre-cleaned</dd>
<dt><strong><code>target_features</code></strong> :&ensp;<code>Collection[str]</code>, optional</dt>
<dd>subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>DataFrame</code></dt>
<dd>dataset with removed NA entries for given subset of features</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.preprocessing.remove_duplicates"><code class="name flex">
<span>def <span class="ident">remove_duplicates</span></span>(<span>data: DataFrame) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def remove_duplicates(
data: DataFrame,
) -&gt; tuple[DataFrame]:
&#34;&#34;&#34;removes duplicated entries over all features in the given dataset
Parameters
----------
data : DataFrame
read data with standard structure
Returns
-------
DataFrame
dataset with removed duplicates over all features
&#34;&#34;&#34;
# obtain info about duplicates over all features
duplicates_filt = data.duplicated()
logger.info(f&#39;Number of duplicates over all features: {duplicates_filt.sum()}&#39;)
# drop duplicates
wo_duplicates = data.drop_duplicates(ignore_index=True)
duplicates_subset: list[str] = [
&#39;VorgangsID&#39;,
&#39;ObjektID&#39;,
]
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
logger.info(
(
&#39;Number of duplicates over subset &#39;
f&#39;&gt;&gt;{duplicates_subset}&lt;&lt;: {duplicates_subset_filt.sum()}&#39;
)
)
wo_duplicates = wo_duplicates.drop_duplicates(
subset=duplicates_subset, ignore_index=True
).copy()
logger.info(&#39;Removed all duplicates from dataset successfully.&#39;)
logger.info(
&#39;New Dataset properties: number of entries: %d, number of features %d&#39;,
len(wo_duplicates),
len(wo_duplicates.columns),
)
return (wo_duplicates,)</code></pre>
</details>
<div class="desc"><p>removes duplicated entries over all features in the given dataset</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>read data with standard structure</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>DataFrame</code></dt>
<dd>dataset with removed duplicates over all features</dd>
</dl></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.preprocessing.analyse_feature" href="#lang_main.analysis.preprocessing.analyse_feature">analyse_feature</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.load_raw_data" href="#lang_main.analysis.preprocessing.load_raw_data">load_raw_data</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.merge_similarity_duplicates" href="#lang_main.analysis.preprocessing.merge_similarity_duplicates">merge_similarity_duplicates</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.numeric_pre_filter_feature" href="#lang_main.analysis.preprocessing.numeric_pre_filter_feature">numeric_pre_filter_feature</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.remove_NA" href="#lang_main.analysis.preprocessing.remove_NA">remove_NA</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.remove_duplicates" href="#lang_main.analysis.preprocessing.remove_duplicates">remove_duplicates</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>

View File

@@ -0,0 +1,273 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.shared API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
    // highlight.js is pulled from a CDN by the preceding `defer` script tag. If that
    // request failed or was blocked (offline viewing, integrity mismatch) the global
    // is missing; keep the plain listings instead of throwing a ReferenceError.
    if (typeof hljs === 'undefined') return;

    // Limit language auto-detection to this list, then highlight the code blocks on the page.
    hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
    hljs.highlightAll();

    /* Collapse source docstrings */
    // Runs 100 ms later -- presumably so the highlighted markup exists before it is queried.
    setTimeout(() => {
        [...document.querySelectorAll('.hljs.language-python > .hljs-string')]
            // Only long (more than 200 characters of markup) triple-quoted strings are folded.
            .filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
            .forEach(el => {
                // Reuse the delimiter the string really opens with, so a '''-docstring is
                // not displayed as opening with """ while still closing with '''.
                const delim = el.innerHTML.substring(0, 3);
                let d = document.createElement('details');
                d.classList.add('hljs-string');
                d.innerHTML = '<summary>' + delim + '</summary>' + el.innerHTML.substring(3);
                el.replaceWith(d);
            });
    }, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.shared</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.shared.candidates_by_index"><code class="name flex">
<span>def <span class="ident">candidates_by_index</span></span>(<span>data_model_input: pandas.core.series.Series,<br>model: sentence_transformers.SentenceTransformer.SentenceTransformer,<br>cos_sim_threshold: float = 0.5) -&gt; Iterator[tuple[int | numpy.int64, int | numpy.int64]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
) -&gt; Iterator[tuple[PandasIndex, PandasIndex]]:
&#34;&#34;&#34;function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset
Parameters
----------
obj_id : ObjectID
_description_
data_model_input : Series
containing indices and text entries to process
model : SentenceTransformer
necessary SentenceTransformer model to encode text entries
cos_sim_threshold : float, optional
threshold for cosine similarity to filter candidates, by default 0.5
Yields
------
Iterator[tuple[PandasIndex, PandasIndex]]
tuple of index pairs which meet the cosine similarity threshold
&#34;&#34;&#34;
# embeddings
batch = cast(list[str], data_model_input.to_list())
embds = cast(
Tensor,
model.encode(
batch,
convert_to_numpy=False,
convert_to_tensor=True,
show_progress_bar=False,
),
)
# cosine similarity
cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim &gt;= cos_sim_threshold)
for idx_array in cos_sim_idx:
idx_pair = cast(
tuple[np.int64, np.int64], tuple(data_model_input.index[idx] for idx in idx_array)
)
yield idx_pair</code></pre>
</details>
<div class="desc"><p>function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
feed data as Series to retain information about indices of entries and
access them later in the original dataset</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data_model_input</code></strong> :&ensp;<code>Series</code></dt>
<dd>containing indices and text entries to process</dd>
<dt><strong><code>model</code></strong> :&ensp;<code>SentenceTransformer</code></dt>
<dd>necessary SentenceTransformer model to encode text entries</dd>
<dt><strong><code>cos_sim_threshold</code></strong> :&ensp;<code>float</code>, optional</dt>
<dd>threshold for cosine similarity to filter candidates, by default 0.5</dd>
</dl>
<h2 id="yields">Yields</h2>
<dl>
<dt><code>Iterator[tuple[PandasIndex, PandasIndex]]</code></dt>
<dd>tuple of index pairs which meet the cosine similarity threshold</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.clean_string_slim"><code class="name flex">
<span>def <span class="ident">clean_string_slim</span></span>(<span>string: str) -&gt; str</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def clean_string_slim(string: str) -&gt; str:
&#34;&#34;&#34;mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
&#34;&#34;&#34;
# remove special chars
# string = pattern_escape_newline.sub(&#39; &#39;, string)
string = pattern_escape_seq.sub(&#39; &#39;, string)
string = pattern_repeated_chars.sub(&#39;&#39;, string)
# string = pattern_dates.sub(&#39;&#39;, string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(&#39; &#39;, string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string</code></pre>
</details>
<div class="desc"><p>mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
<dd>dataset entry feature</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>str</code></dt>
<dd>cleaned entry</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.shared.entry_wise_cleansing"><code class="name flex">
<span>def <span class="ident">entry_wise_cleansing</span></span>(<span>data: pandas.core.frame.DataFrame,<br>target_features: Collection[str],<br>cleansing_func: Callable[[str], str] = &lt;function clean_string_slim&gt;) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def entry_wise_cleansing(
data: DataFrame,
target_features: Collection[str],
cleansing_func: Callable[[str], str] = clean_string_slim,
) -&gt; tuple[DataFrame]:
# apply given cleansing function to target feature
target_features = list(target_features)
data[target_features] = data[target_features].map(cleansing_func)
logger.info(
(&#39;Successfully applied entry-wise cleansing procedure &gt;&gt;%s&lt;&lt; for features &gt;&gt;%s&lt;&lt;&#39;),
cleansing_func.__name__,
target_features,
)
return (data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.shared.similar_index_connection_graph"><code class="name flex">
<span>def <span class="ident">similar_index_connection_graph</span></span>(<span>similar_idx_pairs: Iterable[tuple[int | numpy.int64, int | numpy.int64]]) -&gt; tuple[networkx.classes.graph.Graph, dict[str, float]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_connection_graph(
similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -&gt; tuple[Graph, dict[str, float]]:
# build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
# for idx1, idx2 in similar_idx_pairs:
# # inplace operation, parent/child do not really exist in undirected graph
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
update_graph(graph=similar_id_graph, batch=similar_idx_pairs)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=False)
return similar_id_graph, graph_info</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.shared.similar_index_groups"><code class="name flex">
<span>def <span class="ident">similar_index_groups</span></span>(<span>similar_id_graph: networkx.classes.graph.Graph) -&gt; Iterator[tuple[int | numpy.int64, ...]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def similar_index_groups(
similar_id_graph: Graph,
) -&gt; Iterator[tuple[PandasIndex, ...]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]], nx.connected_components(G=similar_id_graph))
for id_group in ids_groups:
yield tuple(id_group)</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.shared.candidates_by_index" href="#lang_main.analysis.shared.candidates_by_index">candidates_by_index</a></code></li>
<li><code><a title="lang_main.analysis.shared.clean_string_slim" href="#lang_main.analysis.shared.clean_string_slim">clean_string_slim</a></code></li>
<li><code><a title="lang_main.analysis.shared.entry_wise_cleansing" href="#lang_main.analysis.shared.entry_wise_cleansing">entry_wise_cleansing</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_connection_graph" href="#lang_main.analysis.shared.similar_index_connection_graph">similar_index_connection_graph</a></code></li>
<li><code><a title="lang_main.analysis.shared.similar_index_groups" href="#lang_main.analysis.shared.similar_index_groups">similar_index_groups</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>

View File

@@ -0,0 +1,333 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.timeline API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
    // highlight.js is pulled from a CDN by the preceding `defer` script tag. If that
    // request failed or was blocked (offline viewing, integrity mismatch) the global
    // is missing; keep the plain listings instead of throwing a ReferenceError.
    if (typeof hljs === 'undefined') return;

    // Limit language auto-detection to this list, then highlight the code blocks on the page.
    hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
    hljs.highlightAll();

    /* Collapse source docstrings */
    // Runs 100 ms later -- presumably so the highlighted markup exists before it is queried.
    setTimeout(() => {
        [...document.querySelectorAll('.hljs.language-python > .hljs-string')]
            // Only long (more than 200 characters of markup) triple-quoted strings are folded.
            .filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
            .forEach(el => {
                // Reuse the delimiter the string really opens with, so a '''-docstring is
                // not displayed as opening with """ while still closing with '''.
                const delim = el.innerHTML.substring(0, 3);
                let d = document.createElement('details');
                d.classList.add('hljs-string');
                d.innerHTML = '<summary>' + delim + '</summary>' + el.innerHTML.substring(3);
                el.replaceWith(d);
            });
    }, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.timeline</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.timeline.calc_delta_to_next_failure"><code class="name flex">
<span>def <span class="ident">calc_delta_to_next_failure</span></span>(<span>data: pandas.core.frame.DataFrame,<br>date_feature: str = 'ErstellungsDatum',<br>name_delta_feature: str = 'Zeitspanne bis zum nächsten Ereignis [Tage]',<br>convert_to_days: bool = True) -&gt; pandas.core.frame.DataFrame</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def calc_delta_to_next_failure(
data: DataFrameTLFiltered,
date_feature: str = &#39;ErstellungsDatum&#39;,
name_delta_feature: str = NAME_DELTA_FEAT_TO_NEXT_FAILURE,
convert_to_days: bool = True,
) -&gt; DataFrameTLFiltered:
data = data.copy()
last_val = data[date_feature].iat[-1]
shifted = data[date_feature].shift(-1, fill_value=last_val)
data[name_delta_feature] = shifted - data[date_feature]
data = data.sort_values(by=name_delta_feature, ascending=False)
if convert_to_days:
data[name_delta_feature] = data[name_delta_feature].dt.days
return data</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.calc_delta_to_repair"><code class="name flex">
<span>def <span class="ident">calc_delta_to_repair</span></span>(<span>data: pandas.core.frame.DataFrame,<br>date_feature_start: str = 'ErstellungsDatum',<br>date_feature_end: str = 'ErledigungsDatum',<br>name_delta_feature: str = 'Zeitspanne bis zur Behebung [Tage]',<br>convert_to_days: bool = True) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def calc_delta_to_repair(
data: DataFrame,
date_feature_start: str = &#39;ErstellungsDatum&#39;,
date_feature_end: str = &#39;ErledigungsDatum&#39;,
name_delta_feature: str = NAME_DELTA_FEAT_TO_REPAIR,
convert_to_days: bool = True,
) -&gt; tuple[DataFrame]:
logger.info(&#39;Calculating time differences between start and end of operations...&#39;)
data = data.copy()
data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
if convert_to_days:
data[name_delta_feature] = data[name_delta_feature].dt.days
logger.info(&#39;Calculation successful.&#39;)
return (data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.cleanup_descriptions"><code class="name flex">
<span>def <span class="ident">cleanup_descriptions</span></span>(<span>data: pandas.core.frame.DataFrame,<br>properties: Collection[str] = ('VorgangsBeschreibung', 'ErledigungsBeschreibung')) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def cleanup_descriptions(
data: DataFrame,
properties: Collection[str] = (
&#39;VorgangsBeschreibung&#39;,
&#39;ErledigungsBeschreibung&#39;,
),
) -&gt; tuple[DataFrame]:
logger.info(&#39;Cleaning necessary descriptions...&#39;)
data = data.copy()
features = list(properties)
data[features] = data[features].fillna(&#39;N.V.&#39;)
(data,) = entry_wise_cleansing(data, target_features=features)
logger.info(&#39;Cleansing successful.&#39;)
return (data.copy(),)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.filter_activities_per_obj_id"><code class="name flex">
<span>def <span class="ident">filter_activities_per_obj_id</span></span>(<span>data: pandas.core.frame.DataFrame,<br>activity_feature: str = 'VorgangsTypName',<br>relevant_activity_types: Iterable[str] = ('Reparaturauftrag (Portal)',),<br>feature_obj_id: str = 'ObjektID',<br>threshold_num_activities: int = 1) -&gt; tuple[pandas.core.frame.DataFrame, pandas.core.series.Series]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def filter_activities_per_obj_id(
data: DataFrame,
activity_feature: str = &#39;VorgangsTypName&#39;,
relevant_activity_types: Iterable[str] = (&#39;Reparaturauftrag (Portal)&#39;,),
feature_obj_id: str = &#39;ObjektID&#39;,
threshold_num_activities: int = 1,
) -&gt; tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities, count occurrences for each ObjectID
logger.info(&#39;Filtering activities per ObjectID...&#39;)
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
Series, data_filter_activities[feature_obj_id].value_counts(sort=True)
)
# filter for ObjectIDs with more than given number of activities
filt_below_thresh = num_activities_per_obj_id &lt;= threshold_num_activities
# index of series contains ObjectIDs
obj_ids_below_thresh = num_activities_per_obj_id[filt_below_thresh].index
filt_entries_below_thresh = data_filter_activities[feature_obj_id].isin(
obj_ids_below_thresh
)
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info(&#39;Activities per ObjectID filtered successfully.&#39;)
return data_filter_activities, num_activities_per_obj_id</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.filter_timeline_cands"><code class="name flex">
<span>def <span class="ident">filter_timeline_cands</span></span>(<span>data: pandas.core.frame.DataFrame,<br>cands: dict[int, tuple[tuple[int | numpy.int64, ...], ...]],<br>obj_id: int,<br>entry_idx: int,<br>sort_feature: str = 'ErstellungsDatum') -&gt; pandas.core.frame.DataFrame</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def filter_timeline_cands(
data: DataFrame,
cands: TimelineCandidates,
obj_id: ObjectID,
entry_idx: int,
sort_feature: str = &#39;ErstellungsDatum&#39;,
) -&gt; DataFrameTLFiltered:
data = data.copy()
cands_for_obj_id = cands[obj_id]
cands_choice = cands_for_obj_id[entry_idx]
data = data.loc[list(cands_choice)].sort_values(
by=sort_feature,
ascending=True,
)
return data</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.generate_model_input"><code class="name flex">
<span>def <span class="ident">generate_model_input</span></span>(<span>data: pandas.core.frame.DataFrame,<br>target_feature_name: str = 'nlp_model_input',<br>model_input_features: Iterable[str] = ('VorgangsTypName', 'VorgangsArtText', 'VorgangsBeschreibung')) -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def generate_model_input(
data: DataFrame,
target_feature_name: str = &#39;nlp_model_input&#39;,
model_input_features: Iterable[str] = (
&#39;VorgangsTypName&#39;,
&#39;VorgangsArtText&#39;,
&#39;VorgangsBeschreibung&#39;,
),
) -&gt; tuple[DataFrame]:
logger.info(&#39;Generating concatenation of model input features...&#39;)
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna(&#39;&#39;).astype(str)
data[target_feature_name] = input_features.apply(
lambda x: &#39; - &#39;.join(x),
axis=1,
)
logger.info(&#39;Model input generated successfully.&#39;)
return (data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.get_timeline_candidates"><code class="name flex">
<span>def <span class="ident">get_timeline_candidates</span></span>(<span>data: pandas.core.frame.DataFrame,<br>num_activities_per_obj_id: pandas.core.series.Series,<br>*,<br>model: sentence_transformers.SentenceTransformer.SentenceTransformer,<br>cos_sim_threshold: float,<br>feature_obj_id: str = 'ObjektID',<br>feature_obj_text: str = 'HObjektText',<br>model_input_feature: str = 'nlp_model_input') -&gt; tuple[dict[int, tuple[tuple[int | numpy.int64, ...], ...]], dict[int, str]]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def get_timeline_candidates(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
model: SentenceTransformer,
cos_sim_threshold: float,
feature_obj_id: str = &#39;ObjektID&#39;,
feature_obj_text: str = &#39;HObjektText&#39;,
model_input_feature: str = &#39;nlp_model_input&#39;,
) -&gt; tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info(&#39;Obtaining timeline candidates...&#39;)
candidates = _get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
cos_sim_threshold=cos_sim_threshold,
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = _transform_timeline_candidates(candidates)
logger.info(&#39;Timeline candidates obtained successfully.&#39;)
# text mapping to obtain object descriptors
logger.info(&#39;Mapping ObjectIDs to their respective text descriptor...&#39;)
map_obj_text = _map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
feature_obj_text=feature_obj_text,
)
logger.info(&#39;ObjectIDs successfully mapped to text descriptors.&#39;)
return tl_candidates, map_obj_text</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.timeline.remove_non_relevant_obj_ids"><code class="name flex">
<span>def <span class="ident">remove_non_relevant_obj_ids</span></span>(<span>data: pandas.core.frame.DataFrame,<br>thresh_unique_feat_per_id: int,<br>*,<br>feature_uniqueness: str = 'HObjektText',<br>feature_obj_id: str = 'ObjektID') -&gt; tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def remove_non_relevant_obj_ids(
data: DataFrame,
thresh_unique_feat_per_id: int,
*,
feature_uniqueness: str = &#39;HObjektText&#39;,
feature_obj_id: str = &#39;ObjektID&#39;,
) -&gt; tuple[DataFrame]:
logger.info(&#39;Removing non-relevant ObjectIDs from dataset...&#39;)
data = data.copy()
ids_to_ignore = _non_relevant_obj_ids(
data=data,
thresh_unique_feat_per_id=thresh_unique_feat_per_id,
feature_uniqueness=feature_uniqueness,
feature_obj_id=feature_obj_id,
)
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug(&#39;Ignored ObjectIDs: %s&#39;, ids_to_ignore)
logger.info(&#39;Non-relevant ObjectIDs removed successfully.&#39;)
return (data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.timeline.calc_delta_to_next_failure" href="#lang_main.analysis.timeline.calc_delta_to_next_failure">calc_delta_to_next_failure</a></code></li>
<li><code><a title="lang_main.analysis.timeline.calc_delta_to_repair" href="#lang_main.analysis.timeline.calc_delta_to_repair">calc_delta_to_repair</a></code></li>
<li><code><a title="lang_main.analysis.timeline.cleanup_descriptions" href="#lang_main.analysis.timeline.cleanup_descriptions">cleanup_descriptions</a></code></li>
<li><code><a title="lang_main.analysis.timeline.filter_activities_per_obj_id" href="#lang_main.analysis.timeline.filter_activities_per_obj_id">filter_activities_per_obj_id</a></code></li>
<li><code><a title="lang_main.analysis.timeline.filter_timeline_cands" href="#lang_main.analysis.timeline.filter_timeline_cands">filter_timeline_cands</a></code></li>
<li><code><a title="lang_main.analysis.timeline.generate_model_input" href="#lang_main.analysis.timeline.generate_model_input">generate_model_input</a></code></li>
<li><code><a title="lang_main.analysis.timeline.get_timeline_candidates" href="#lang_main.analysis.timeline.get_timeline_candidates">get_timeline_candidates</a></code></li>
<li><code><a title="lang_main.analysis.timeline.remove_non_relevant_obj_ids" href="#lang_main.analysis.timeline.remove_non_relevant_obj_ids">remove_non_relevant_obj_ids</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>

View File

@@ -0,0 +1,320 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.tokens API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
    // highlight.js is pulled from a CDN by the preceding `defer` script tag. If that
    // request failed or was blocked (offline viewing, integrity mismatch) the global
    // is missing; keep the plain listings instead of throwing a ReferenceError.
    if (typeof hljs === 'undefined') return;

    // Limit language auto-detection to this list, then highlight the code blocks on the page.
    hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
    hljs.highlightAll();

    /* Collapse source docstrings */
    // Runs 100 ms later -- presumably so the highlighted markup exists before it is queried.
    setTimeout(() => {
        [...document.querySelectorAll('.hljs.language-python > .hljs-string')]
            // Only long (more than 200 characters of markup) triple-quoted strings are folded.
            .filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
            .forEach(el => {
                // Reuse the delimiter the string really opens with, so a '''-docstring is
                // not displayed as opening with """ while still closing with '''.
                const delim = el.innerHTML.substring(0, 3);
                let d = document.createElement('details');
                d.classList.add('hljs-string');
                d.innerHTML = '<summary>' + delim + '</summary>' + el.innerHTML.substring(3);
                el.replaceWith(d);
            });
    }, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.tokens</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.tokens.add_doc_info_to_graph"><code class="name flex">
<span>def <span class="ident">add_doc_info_to_graph</span></span>(<span>graph: <a title="lang_main.analysis.graphs.TokenGraph" href="graphs.html#lang_main.analysis.graphs.TokenGraph">TokenGraph</a>,<br>doc: spacy.tokens.doc.Doc,<br>weight: int | None) -&gt; None</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def add_doc_info_to_graph(
graph: TokenGraph,
doc: SpacyDoc,
weight: int | None,
) -&gt; None:
# iterate over sentences
for sent in doc.sents:
# iterate over tokens in sentence
for token in sent:
# skip tokens which are not relevant
if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
continue
# skip tokens which are dates or times
if token.pos_ == &#39;NUM&#39; and is_str_date(string=token.text):
continue
relevant_descendants = obtain_relevant_descendants(token=token)
# for non-AUX: add parent &lt;--&gt; descendant pair to graph
if token.pos_ not in POS_INDIRECT:
for descendant in relevant_descendants:
# add descendant and parent to graph
update_graph(
graph=graph,
parent=token.lemma_,
child=descendant.lemma_,
weight_connection=weight,
)
else:
# if indirect POS, make connection between all associated words
combs = combinations(relevant_descendants, r=2)
for comb in combs:
# !! parents and children do not really exist in this case,
# !! but only one connection is made
update_graph(
graph=graph,
parent=comb[0].lemma_,
child=comb[1].lemma_,
weight_connection=weight,
)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.build_token_graph"><code class="name flex">
<span>def <span class="ident">build_token_graph</span></span>(<span>data: pandas.core.frame.DataFrame,<br>model: spacy.language.Language,<br>*,<br>target_feature: str = 'entry',<br>weights_feature: str | None = None,<br>batch_idx_feature: str | None = 'batched_idxs',<br>build_map: bool = True,<br>batch_size_model: int = 50,<br>logging_graph: bool = True) -&gt; tuple[<a title="lang_main.analysis.graphs.TokenGraph" href="graphs.html#lang_main.analysis.graphs.TokenGraph">TokenGraph</a>, dict[int | numpy.int64, spacy.tokens.doc.Doc] | None]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_token_graph(
data: DataFrame,
model: SpacyModel,
*,
target_feature: str = &#39;entry&#39;,
weights_feature: str | None = None,
batch_idx_feature: str | None = &#39;batched_idxs&#39;,
build_map: bool = True,
batch_size_model: int = 50,
logging_graph: bool = True,
) -&gt; tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
graph = TokenGraph(enable_logging=logging_graph)
model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
if weights_feature is not None:
weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
else:
weights = None
docs_mapping: dict[PandasIndex, SpacyDoc] | None
if build_map and batch_idx_feature is None:
raise ValueError(&#39;Can not build mapping if batched indices are unknown.&#39;)
elif build_map:
indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
docs_mapping = {}
else:
indices = None
docs_mapping = None
index: int = 0
for doc in tqdm(
model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
):
weight: int | None = None
if weights is not None:
weight = weights[index]
add_doc_info_to_graph(
graph=graph,
doc=doc,
weight=weight,
)
# build map if option chosen
if indices is not None and docs_mapping is not None:
corresponding_indices = indices[index]
for idx in corresponding_indices:
docs_mapping[idx] = doc
index += 1
# metadata
graph.update_metadata()
# convert to undirected
graph.to_undirected(logging=False)
graph.perform_static_analysis()
return graph, docs_mapping</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.is_str_date"><code class="name flex">
<span>def <span class="ident">is_str_date</span></span>(<span>string: str, fuzzy: bool = False) -&gt; bool</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def is_str_date(
string: str,
fuzzy: bool = False,
) -&gt; bool:
&#34;&#34;&#34;Unstable heuristic that tests whether a string is a date; not 100 percent reliable.
Parameters
----------
string : str
string to check for dates
fuzzy : bool, optional
whether to use the fuzzy parsing capability of dateutil.parser.parse, by default False
Returns
-------
bool
indicates whether date was found or not
&#34;&#34;&#34;
try:
# check if string is a number
# a purely numeric string is only kept as a date candidate if it has 2 or 4 characters
int(string)
if len(string) not in {2, 4}:
return False
except ValueError:
# not a number
pass
try:
parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False)
return True
except ValueError:
date_found: bool = False
match = pattern_dates.search(string)
if match is None:
return date_found
date_found = any(match.groups())
return date_found</code></pre>
</details>
<div class="desc"><p>Unstable heuristic that tests whether a string is a date; not 100 percent reliable.</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
<dd>string to check for dates</dd>
<dt><strong><code>fuzzy</code></strong> :&ensp;<code>bool</code>, optional</dt>
<dd>whether to use the fuzzy parsing capability of dateutil.parser.parse, by default False</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>bool</code></dt>
<dd>indicates whether date was found or not</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.tokens.obtain_relevant_descendants"><code class="name flex">
<span>def <span class="ident">obtain_relevant_descendants</span></span>(<span>token: spacy.tokens.token.Token) -&gt; Iterator[spacy.tokens.token.Token]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def obtain_relevant_descendants(
token: SpacyToken,
) -&gt; Iterator[SpacyToken]:
for descendant in token.subtree:
# subtrees contain the token itself
# if current element is token skip this element
if descendant == token:
continue
# if descendant is a date, skip it
if is_str_date(string=descendant.text):
continue
logger.debug(
&#39;Token &gt;&gt;%s&lt;&lt;, POS &gt;&gt;%s&lt;&lt; | descendant &gt;&gt;%s&lt;&lt;, POS &gt;&gt;%s&lt;&lt;&#39;,
token,
token.pos_,
descendant,
descendant.pos_,
)
# eliminate cases of cross-references with verbs
if (token.pos_ == &#39;AUX&#39; or token.pos_ == &#39;VERB&#39;) and (
descendant.pos_ == &#39;AUX&#39; or descendant.pos_ == &#39;VERB&#39;
):
continue
# skip cases in which descendant is indirect POS with others than verbs
elif descendant.pos_ in POS_INDIRECT:
continue
# skip cases in which child has no relevant POS or TAG
elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
continue
yield descendant
# TODO look at results and fine-tune function accordingly</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.pre_clean_word"><code class="name flex">
<span>def <span class="ident">pre_clean_word</span></span>(<span>string: str) -&gt; str</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def pre_clean_word(string: str) -&gt; str:
pattern = r&#39;[^A-Za-zäöüÄÖÜ]+&#39;
string = re.sub(pattern, &#39;&#39;, string)
return string</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.tokens.add_doc_info_to_graph" href="#lang_main.analysis.tokens.add_doc_info_to_graph">add_doc_info_to_graph</a></code></li>
<li><code><a title="lang_main.analysis.tokens.build_token_graph" href="#lang_main.analysis.tokens.build_token_graph">build_token_graph</a></code></li>
<li><code><a title="lang_main.analysis.tokens.is_str_date" href="#lang_main.analysis.tokens.is_str_date">is_str_date</a></code></li>
<li><code><a title="lang_main.analysis.tokens.obtain_relevant_descendants" href="#lang_main.analysis.tokens.obtain_relevant_descendants">obtain_relevant_descendants</a></code></li>
<li><code><a title="lang_main.analysis.tokens.pre_clean_word" href="#lang_main.analysis.tokens.pre_clean_word">pre_clean_word</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>