<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.preprocessing API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => {
hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
hljs.highlightAll();
/* Collapse source docstrings */
setTimeout(() => {
[...document.querySelectorAll('.hljs.language-python > .hljs-string')]
.filter(el => el.innerHTML.length > 200 && ['"""', "'''"].includes(el.innerHTML.substring(0, 3)))
.forEach(el => {
let d = document.createElement('details');
d.classList.add('hljs-string');
d.innerHTML = '<summary>"""</summary>' + el.innerHTML.substring(3);
el.replaceWith(d);
});
}, 100);
})</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.preprocessing</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.preprocessing.analyse_feature"><code class="name flex">
<span>def <span class="ident">analyse_feature</span></span>(<span>data: DataFrame, target_feature: str) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def analyse_feature(
data: DataFrame,
target_feature: str,
) -&gt; tuple[DataFrame]:
# feature columns
feature_entries = data[target_feature]
logger.info(
&#39;Number of entries for feature &gt;&gt;%s&lt;&lt;: %d&#39;, target_feature, len(feature_entries)
)
# obtain unique entries
unique_feature_entries = feature_entries.unique()
    # prepare result DataFrame: collect one row per unique entry, build the frame once
    cols = [&#39;batched_idxs&#39;, &#39;entry&#39;, &#39;len&#39;, &#39;num_occur&#39;, &#39;assoc_obj_ids&#39;, &#39;num_assoc_obj_ids&#39;]
    rows: list[list] = []
    for entry in tqdm(unique_feature_entries, mininterval=1.0):
        len_entry = len(entry)
        filt = data[target_feature] == entry
        temp = data[filt]
        batched_idxs = temp.index.to_numpy()
        assoc_obj_ids = temp[&#39;ObjektID&#39;].unique()
        assoc_obj_ids = np.sort(assoc_obj_ids, kind=&#39;stable&#39;)
        num_assoc_obj_ids = len(assoc_obj_ids)
        num_dupl = filt.sum()
        # appending to a list and concatenating once avoids the quadratic
        # cost of pd.concat inside the loop
        rows.append([batched_idxs, entry, len_entry, num_dupl, assoc_obj_ids, num_assoc_obj_ids])
    result_df = pd.DataFrame(data=rows, columns=cols)
result_df = result_df.sort_values(
by=[&#39;num_occur&#39;, &#39;len&#39;], ascending=[False, False]
).copy()
return (result_df,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.preprocessing.load_raw_data"><code class="name flex">
<span>def <span class="ident">load_raw_data</span></span>(<span>path: Path,<br>date_cols: Collection[str] = ('VorgangsDatum', 'ErledigungsDatum', 'Arbeitsbeginn', 'ErstellungsDatum')) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def load_raw_data(
path: Path,
date_cols: Collection[str] = (
&#39;VorgangsDatum&#39;,
&#39;ErledigungsDatum&#39;,
&#39;Arbeitsbeginn&#39;,
&#39;ErstellungsDatum&#39;,
),
) -&gt; tuple[DataFrame]:
    &#34;&#34;&#34;load an IHM dataset with the standard structure
    Parameters
    ----------
    path : Path
        path to the dataset file, usually a CSV file
    date_cols : Collection[str], optional
        columns which contain dates and are parsed as such,
        by default (
            &#39;VorgangsDatum&#39;,
            &#39;ErledigungsDatum&#39;,
            &#39;Arbeitsbeginn&#39;,
            &#39;ErstellungsDatum&#39;,
        )
    Returns
    -------
    tuple[DataFrame]
        raw dataset as a one-element tuple
    &#34;&#34;&#34;
# load dataset
date_cols = list(date_cols)
data = pd.read_csv(
filepath_or_buffer=path,
sep=&#39;;&#39;,
encoding=&#39;cp1252&#39;,
        parse_dates=date_cols,
dayfirst=True,
)
logger.info(&#39;Loaded dataset successfully.&#39;)
logger.info(
(
f&#39;Dataset properties: number of entries: {len(data)}, &#39;
f&#39;number of features {len(data.columns)}&#39;
)
)
return (data,)</code></pre>
</details>
<div class="desc"><p>load IHM dataset with standard structure</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>path</code></strong> :&ensp;<code>str</code></dt>
<dd>path to dataset file, usually CSV file</dd>
<dt><strong><code>date_cols</code></strong> :&ensp;<code>Collection[str]</code>, optional</dt>
<dd>columns which contain dates and are parsed as such,
by default (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
'ErstellungsDatum',
)</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>DataFrame</code></dt>
<dd>raw dataset as DataFrame</dd>
</dl></div>
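<p>A short usage sketch; the file path is a placeholder. The separator (<code>;</code>),
the encoding (<code>cp1252</code>), and day-first date parsing are hard-coded in the function.</p>
<pre><code class="python">from pathlib import Path

# hypothetical path to a &#39;;&#39;-separated, cp1252-encoded IHM export
(raw,) = load_raw_data(Path(&#39;data/ihm_export.csv&#39;))
# pass date_cols explicitly if the export deviates from the default layout
(raw,) = load_raw_data(Path(&#39;data/ihm_export.csv&#39;), date_cols=(&#39;VorgangsDatum&#39;,))</code></pre>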
</dd>
<dt id="lang_main.analysis.preprocessing.merge_similarity_duplicates"><code class="name flex">
<span>def <span class="ident">merge_similarity_duplicates</span></span>(<span>data: DataFrame, model: SentenceTransformer, cos_sim_threshold: float) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def merge_similarity_duplicates(
data: DataFrame,
model: SentenceTransformer,
cos_sim_threshold: float,
) -&gt; tuple[DataFrame]:
logger.info(&#39;Start merging of similarity candidates...&#39;)
# data
merged_data = data.copy()
model_input = merged_data[&#39;entry&#39;]
candidates_idx = candidates_by_index(
data_model_input=model_input,
model=model,
cos_sim_threshold=cos_sim_threshold,
)
# graph of similar ids
similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
for similar_id_group in similar_index_groups(similar_id_graph):
similar_id_group = list(similar_id_group)
similar_data = merged_data.loc[similar_id_group, :]
# keep first entry with max number occurrences, then number of
# associated objects, then length of entry
similar_data = similar_data.sort_values(
by=[&#39;num_occur&#39;, &#39;num_assoc_obj_ids&#39;, &#39;len&#39;],
ascending=[False, False, False],
)
# merge information to first entry
data_idx = cast(PandasIndex, similar_data.index[0])
similar_data.at[data_idx, &#39;num_occur&#39;] = similar_data[&#39;num_occur&#39;].sum()
assoc_obj_ids = similar_data[&#39;assoc_obj_ids&#39;].to_numpy()
assoc_obj_ids = np.concatenate(assoc_obj_ids)
assoc_obj_ids = np.unique(assoc_obj_ids)
similar_data.at[data_idx, &#39;assoc_obj_ids&#39;] = assoc_obj_ids
similar_data.at[data_idx, &#39;num_assoc_obj_ids&#39;] = len(assoc_obj_ids)
# remaining indices, should be removed
similar_id_group.remove(data_idx)
merged_similar_data = similar_data.drop(index=similar_id_group)
# update entry in main dataset, drop remaining entries
merged_data.update(merged_similar_data)
merged_data = merged_data.drop(index=similar_id_group)
logger.info(&#39;Similarity candidates merged successfully.&#39;)
return (merged_data,)</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.preprocessing.numeric_pre_filter_feature"><code class="name flex">
<span>def <span class="ident">numeric_pre_filter_feature</span></span>(<span>data: DataFrame, feature: str, bound_lower: int | None, bound_upper: int | None) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def numeric_pre_filter_feature(
data: DataFrame,
feature: str,
bound_lower: int | None,
bound_upper: int | None,
) -&gt; tuple[DataFrame]:
    &#34;&#34;&#34;filter a DataFrame on a given numerical feature by lower and upper bounds;
    bounds are inclusive: entries (bound_lower &lt;= entry &lt;= bound_upper) are retained
Parameters
----------
data : DataFrame
DataFrame to filter
feature : str
feature name to filter
bound_lower : int | None
lower bound of values to retain
bound_upper : int | None
upper bound of values to retain
Returns
-------
tuple[DataFrame]
filtered DataFrame
Raises
------
    ValueError
        if neither bound is provided; at least one bound must be set
&#34;&#34;&#34;
    # explicit None checks: a numeric bound of 0 is falsy but still a valid bound
    if bound_lower is None and bound_upper is None:
        raise ValueError(&#39;No bounds for filtering provided&#39;)
data = data.copy()
if bound_lower is None:
bound_lower = cast(int, data[feature].min())
if bound_upper is None:
bound_upper = cast(int, data[feature].max())
filter_lower = data[feature] &gt;= bound_lower
filter_upper = data[feature] &lt;= bound_upper
    filt = filter_lower &amp; filter_upper  # avoid shadowing the built-in &#39;filter&#39;
    data = data.loc[filt]
return (data,)</code></pre>
</details>
<div class="desc"><p>filter DataFrame for a given numerical feature regarding their bounds
bounds are inclusive: entries (bound_lower &lt;= entry &lt;= bound_upper) are retained</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>DataFrame to filter</dd>
<dt><strong><code>feature</code></strong> :&ensp;<code>str</code></dt>
<dd>feature name to filter</dd>
<dt><strong><code>bound_lower</code></strong> :&ensp;<code>int | None</code></dt>
<dd>lower bound of values to retain</dd>
<dt><strong><code>bound_upper</code></strong> :&ensp;<code>int | None</code></dt>
<dd>upper bound of values to retain</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>tuple[DataFrame]</code></dt>
<dd>filtered DataFrame</dd>
</dl>
<h2 id="raises">Raises</h2>
<dl>
<dt><code>ValueError</code></dt>
<dd>if neither bound is provided; at least one bound must be set</dd>
</dl></div>
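<p>A minimal sketch, assuming the frequency table from <code><a title="lang_main.analysis.preprocessing.analyse_feature" href="#lang_main.analysis.preprocessing.analyse_feature">analyse_feature()</a></code>
as input; any numeric column works. Either bound may be <code>None</code> to leave that side open.</p>
<pre><code class="python"># keep entries whose text length lies in [10, 200] (bounds inclusive)
(filtered,) = numeric_pre_filter_feature(result, feature=&#39;len&#39;, bound_lower=10, bound_upper=200)
# open upper bound: keep everything occurring at least twice
(filtered,) = numeric_pre_filter_feature(result, feature=&#39;num_occur&#39;, bound_lower=2, bound_upper=None)</code></pre>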
</dd>
<dt id="lang_main.analysis.preprocessing.remove_NA"><code class="name flex">
<span>def <span class="ident">remove_NA</span></span>(<span>data: DataFrame, target_features: Collection[str] = ('VorgangsBeschreibung',)) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def remove_NA(
data: DataFrame,
target_features: Collection[str] = (&#39;VorgangsBeschreibung&#39;,),
) -&gt; tuple[DataFrame]:
    &#34;&#34;&#34;drop NA entries based on a subset of features to be analysed
    Parameters
    ----------
    data : DataFrame
        standard IHM dataset, possibly pre-cleaned
    target_features : Collection[str], optional
        subset of features that defines an NA entry, by default (&#39;VorgangsBeschreibung&#39;,)
Returns
-------
    tuple[DataFrame]
        dataset with NA entries removed for the given subset of features
&#34;&#34;&#34;
target_features = list(target_features)
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info(
f&#39;Removed NA entries for features &gt;&gt;{target_features}&lt;&lt; from dataset successfully.&#39;
)
return (wo_NA,)</code></pre>
</details>
<div class="desc"><p>function to drop NA entries based on a subset of features to be analysed</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>standard IHM dataset, perhaps pre-cleaned</dd>
<dt><strong><code>target_features</code></strong> :&ensp;<code>Collection[str]</code>, optional</dt>
<dd>subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>tuple[DataFrame]</code></dt>
<dd>dataset with NA entries removed for the given subset of features</dd>
</dl></div>
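<p>Usage sketch; <code>raw</code> stands for a frame as returned by <code><a title="lang_main.analysis.preprocessing.load_raw_data" href="#lang_main.analysis.preprocessing.load_raw_data">load_raw_data()</a></code>.</p>
<pre><code class="python"># drop rows with a missing &#39;VorgangsBeschreibung&#39; (the default subset)
(cleaned,) = remove_NA(raw)
# or require several features to be present at once
(cleaned,) = remove_NA(raw, target_features=(&#39;VorgangsBeschreibung&#39;, &#39;ObjektID&#39;))</code></pre>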
</dd>
<dt id="lang_main.analysis.preprocessing.remove_duplicates"><code class="name flex">
<span>def <span class="ident">remove_duplicates</span></span>(<span>data: DataFrame) > tuple[pandas.core.frame.DataFrame]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def remove_duplicates(
data: DataFrame,
) -&gt; tuple[DataFrame]:
    &#34;&#34;&#34;remove duplicated entries, first over all features, then over the (&#39;VorgangsID&#39;, &#39;ObjektID&#39;) subset
Parameters
----------
data : DataFrame
read data with standard structure
Returns
-------
    tuple[DataFrame]
        dataset with duplicates removed
&#34;&#34;&#34;
# obtain info about duplicates over all features
duplicates_filt = data.duplicated()
logger.info(f&#39;Number of duplicates over all features: {duplicates_filt.sum()}&#39;)
# drop duplicates
wo_duplicates = data.drop_duplicates(ignore_index=True)
duplicates_subset: list[str] = [
&#39;VorgangsID&#39;,
&#39;ObjektID&#39;,
]
duplicates_subset_filt = wo_duplicates.duplicated(subset=duplicates_subset)
logger.info(
(
&#39;Number of duplicates over subset &#39;
f&#39;&gt;&gt;{duplicates_subset}&lt;&lt;: {duplicates_subset_filt.sum()}&#39;
)
)
wo_duplicates = wo_duplicates.drop_duplicates(
subset=duplicates_subset, ignore_index=True
).copy()
logger.info(&#39;Removed all duplicates from dataset successfully.&#39;)
logger.info(
&#39;New Dataset properties: number of entries: %d, number of features %d&#39;,
len(wo_duplicates),
len(wo_duplicates.columns),
)
return (wo_duplicates,)</code></pre>
</details>
<div class="desc"><p>removes duplicated entries over all features in the given dataset</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>data</code></strong> :&ensp;<code>DataFrame</code></dt>
<dd>read data with standard structure</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>tuple[DataFrame]</code></dt>
<dd>dataset with duplicates removed</dd>
</dl></div>
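<p>Usage sketch; the input must contain the hard-coded subset columns <code>VorgangsID</code>
and <code>ObjektID</code>. <code>cleaned</code> stands for the output of <code><a title="lang_main.analysis.preprocessing.remove_NA" href="#lang_main.analysis.preprocessing.remove_NA">remove_NA()</a></code>.</p>
<pre><code class="python"># drops full-row duplicates first, then duplicates over
# (&#39;VorgangsID&#39;, &#39;ObjektID&#39;); the index is reset in both steps
(deduped,) = remove_duplicates(cleaned)</code></pre>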
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.preprocessing.analyse_feature" href="#lang_main.analysis.preprocessing.analyse_feature">analyse_feature</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.load_raw_data" href="#lang_main.analysis.preprocessing.load_raw_data">load_raw_data</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.merge_similarity_duplicates" href="#lang_main.analysis.preprocessing.merge_similarity_duplicates">merge_similarity_duplicates</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.numeric_pre_filter_feature" href="#lang_main.analysis.preprocessing.numeric_pre_filter_feature">numeric_pre_filter_feature</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.remove_NA" href="#lang_main.analysis.preprocessing.remove_NA">remove_NA</a></code></li>
<li><code><a title="lang_main.analysis.preprocessing.remove_duplicates" href="#lang_main.analysis.preprocessing.remove_duplicates">remove_duplicates</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>