<!--
lang-main/docs/lang_main/pipelines/predefined.html
2025-01-22 16:54:15 +01:00

387 lines
19 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-->

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.pipelines.predefined API documentation</title>
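<meta name="description" content="API documentation for lang_main.pipelines.predefined: builder functions that return predefined Pipeline objects.">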
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>
/*
 * Page bootstrap, run once the DOM has been parsed:
 *   1. configure highlight.js with the languages it may auto-detect and
 *      highlight every code block on the page;
 *   2. shortly afterwards, wrap long triple-quoted Python strings
 *      (docstrings) in a <details> element so they start out collapsed.
 */
window.addEventListener('DOMContentLoaded', () => {
  hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
  hljs.highlightAll();

  /* Collapse source docstrings */
  const DOCSTRING_DELIMITERS = ['"""', "'''"];
  // The delay lets highlightAll() finish inserting its .hljs-string spans.
  setTimeout(() => {
    [...document.querySelectorAll('.hljs.language-python > .hljs-string')]
      .filter(el => el.innerHTML.length > 200 && DOCSTRING_DELIMITERS.includes(el.innerHTML.substring(0, 3)))
      .forEach(el => {
        // Reuse the delimiter the string really starts with, so a '''-quoted
        // docstring is not summarised as """.
        const delimiter = el.innerHTML.substring(0, 3);
        const details = document.createElement('details');
        details.classList.add('hljs-string');
        details.innerHTML = '<summary>' + delimiter + '</summary>' + el.innerHTML.substring(3);
        el.replaceWith(details);
      });
  }, 100);
});
</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.pipelines.predefined</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<!-- build_base_target_feature_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_base_target_feature_pipe"><code class="name flex">
<span>def <span class="ident">build_base_target_feature_pipe</span></span>(<span>) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_base_target_feature_pipe() -&gt; Pipeline:
    pipe_target_feat = Pipeline(name=&#39;Target_Feature&#39;, working_dir=SAVE_PATH_FOLDER)
    pipe_target_feat.add(
        load_raw_data,
        {
            &#39;date_cols&#39;: DATE_COLS,
        },
    )
    pipe_target_feat.add(remove_duplicates)
    pipe_target_feat.add(remove_NA, save_result=True)
    pipe_target_feat.add(
        entry_wise_cleansing,
        {
            &#39;target_features&#39;: (TARGET_FEATURE,),
            &#39;cleansing_func&#39;: clean_string_slim,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE,
    )
    pipe_target_feat.add(
        analyse_feature,
        {
            &#39;target_feature&#39;: TARGET_FEATURE,
        },
        save_result=True,
    )
    return pipe_target_feat</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_merge_duplicates_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_merge_duplicates_pipe"><code class="name flex">
<span>def <span class="ident">build_merge_duplicates_pipe</span></span>(<span>) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_merge_duplicates_pipe() -&gt; Pipeline:
    pipe_merge = Pipeline(name=&#39;Merge_Duplicates&#39;, working_dir=SAVE_PATH_FOLDER)
    pipe_merge.add(
        numeric_pre_filter_feature,
        {
            &#39;feature&#39;: &#39;len&#39;,
            &#39;bound_lower&#39;: THRESHOLD_AMOUNT_CHARACTERS,
            &#39;bound_upper&#39;: None,
        },
    )
    pipe_merge.add(
        merge_similarity_duplicates,
        {
            &#39;model&#39;: STFR_MODEL,
            &#39;cos_sim_threshold&#39;: THRESHOLD_SIMILARITY,
        },
        save_result=True,
        filename=EntryPoints.TOKEN_ANALYSIS,
    )
    return pipe_merge</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_timeline_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_timeline_pipe"><code class="name flex">
<span>def <span class="ident">build_timeline_pipe</span></span>(<span>) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_timeline_pipe() -&gt; Pipeline:
    pipe_timeline = Pipeline(name=&#39;Timeline_Analysis&#39;, working_dir=SAVE_PATH_FOLDER)
    pipe_timeline.add(
        cleanup_descriptions,
        {
            &#39;properties&#39;: [&#39;ErledigungsBeschreibung&#39;],
        },
    )
    pipe_timeline.add(
        calc_delta_to_repair,
        {
            &#39;date_feature_start&#39;: &#39;ErstellungsDatum&#39;,
            &#39;date_feature_end&#39;: &#39;ErledigungsDatum&#39;,
            &#39;name_delta_feature&#39;: NAME_DELTA_FEAT_TO_REPAIR,
            &#39;convert_to_days&#39;: True,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
    )
    pipe_timeline.add(
        remove_non_relevant_obj_ids,
        {
            &#39;thresh_unique_feat_per_id&#39;: THRESHOLD_UNIQUE_TEXTS,
            &#39;feature_uniqueness&#39;: UNIQUE_CRITERION_FEATURE,
            &#39;feature_obj_id&#39;: FEATURE_NAME_OBJ_ID,
        },
        save_result=True,
    )
    pipe_timeline.add(
        generate_model_input,
        {
            &#39;target_feature_name&#39;: &#39;nlp_model_input&#39;,
            &#39;model_input_features&#39;: MODEL_INPUT_FEATURES,
        },
    )
    pipe_timeline.add(
        filter_activities_per_obj_id,
        {
            &#39;activity_feature&#39;: ACTIVITY_FEATURE,
            &#39;relevant_activity_types&#39;: ACTIVITY_TYPES,
            &#39;feature_obj_id&#39;: FEATURE_NAME_OBJ_ID,
            &#39;threshold_num_activities&#39;: THRESHOLD_NUM_ACTIVITIES,
        },
    )
    pipe_timeline.add(
        get_timeline_candidates,
        {
            &#39;model&#39;: STFR_MODEL,
            &#39;cos_sim_threshold&#39;: THRESHOLD_TIMELINE_SIMILARITY,
            &#39;feature_obj_id&#39;: FEATURE_NAME_OBJ_ID,
            &#39;feature_obj_text&#39;: FEATURE_NAME_OBJ_TEXT,
            &#39;model_input_feature&#39;: &#39;nlp_model_input&#39;,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_CANDS,
    )
    return pipe_timeline</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_tk_graph_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_tk_graph_pipe"><code class="name flex">
<span>def <span class="ident">build_tk_graph_pipe</span></span>(<span>) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_tk_graph_pipe() -&gt; Pipeline:
    pipe_token_analysis = Pipeline(name=&#39;Token_Analysis&#39;, working_dir=SAVE_PATH_FOLDER)
    pipe_token_analysis.add(
        build_token_graph,
        {
            &#39;model&#39;: SPACY_MODEL,
            &#39;target_feature&#39;: &#39;entry&#39;,
            &#39;weights_feature&#39;: &#39;num_occur&#39;,
            &#39;batch_idx_feature&#39;: &#39;batched_idxs&#39;,
            &#39;build_map&#39;: False,
            &#39;batch_size_model&#39;: 50,
        },
        save_result=True,
        filename=EntryPoints.TK_GRAPH_POST,
    )
    return pipe_token_analysis</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_tk_graph_post_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_tk_graph_post_pipe"><code class="name flex">
<span>def <span class="ident">build_tk_graph_post_pipe</span></span>(<span>) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_tk_graph_post_pipe() -&gt; Pipeline:
    pipe_graph_postprocessing = Pipeline(
        name=&#39;Graph_Postprocessing&#39;, working_dir=SAVE_PATH_FOLDER
    )
    pipe_graph_postprocessing.add(
        graphs.filter_graph_by_number_edges,
        {
            &#39;limit&#39;: MAX_EDGE_NUMBER,
            &#39;property&#39;: &#39;weight&#39;,
        },
    )
    pipe_graph_postprocessing.add(
        graphs.filter_graph_by_node_degree,
        {
            &#39;bound_lower&#39;: 1,
            &#39;bound_upper&#39;: None,
        },
    )
    pipe_graph_postprocessing.add(
        graphs.static_graph_analysis,
        save_result=True,
        filename=EntryPoints.TK_GRAPH_ANALYSIS,
    )
    return pipe_graph_postprocessing</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_tk_graph_render_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<!-- NOTE(review): the listing's indentation was reconstructed; both subgraph steps are assumed to sit under `if with_subgraphs:`. Confirm against the module source. -->
<dt id="lang_main.pipelines.predefined.build_tk_graph_render_pipe"><code class="name flex">
<span>def <span class="ident">build_tk_graph_render_pipe</span></span>(<span>with_subgraphs: bool,<br>export_folder: pathlib.Path = WindowsPath('A:/Arbeitsaufgaben/lang-data/out'),<br>base_network_name: str = 'token_graph') -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_tk_graph_render_pipe(
    with_subgraphs: bool,
    export_folder: Path = SAVE_PATH_FOLDER,
    base_network_name: str = CYTO_BASE_NETWORK_NAME,
) -&gt; Pipeline:
    # optional dependency: late import
    # raises exception if necessary modules are not found
    try:
        from lang_main.render import cytoscape as cyto
    except ImportError:
        raise ImportError(
            (
                &#39;Dependencies for Cytoscape interaction not found.&#39;
                &#39;Install package with optional dependencies.&#39;
            )
        )
    pipe_graph_rendering = Pipeline(
        name=&#39;Graph_Static-Rendering&#39;,
        working_dir=SAVE_PATH_FOLDER,
    )
    pipe_graph_rendering.add(
        cyto.import_to_cytoscape,
        {
            &#39;network_name&#39;: base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.layout_network,
        {
            &#39;network_name&#39;: base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.apply_style_to_network,
        {
            &#39;network_name&#39;: base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.export_network_to_image,
        {
            &#39;filename&#39;: base_network_name,
            &#39;target_folder&#39;: export_folder,
            &#39;network_name&#39;: base_network_name,
        },
    )
    if with_subgraphs:
        pipe_graph_rendering.add(
            cyto.get_subgraph_node_selection,
            {
                &#39;network_name&#39;: base_network_name,
            },
        )
        pipe_graph_rendering.add(
            cyto.build_subnetworks,
            {
                &#39;export_image&#39;: True,
                &#39;target_folder&#39;: export_folder,
                &#39;network_name&#39;: base_network_name,
            },
        )
    return pipe_graph_rendering</code></pre>
</details>
<div class="desc"></div>
</dd>
<!-- build_tk_graph_rescaling_pipe: signature, collapsible source listing, and description (empty: the listing has no docstring). -->
<dt id="lang_main.pipelines.predefined.build_tk_graph_rescaling_pipe"><code class="name flex">
<span>def <span class="ident">build_tk_graph_rescaling_pipe</span></span>(<span>save_result: bool, exit_point: lang_main.types.EntryPoints) -&gt; <a title="lang_main.pipelines.base.Pipeline" href="base.html#lang_main.pipelines.base.Pipeline">Pipeline</a></span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_tk_graph_rescaling_pipe(
    save_result: bool,
    exit_point: EntryPoints,
) -&gt; Pipeline:
    pipe_graph_rescaling = Pipeline(name=&#39;Graph_Rescaling&#39;, working_dir=SAVE_PATH_FOLDER)
    pipe_graph_rescaling.add(
        graphs.pipe_rescale_graph_edge_weights,
    )
    pipe_graph_rescaling.add(
        graphs.pipe_add_graph_metrics,
        save_result=save_result,
        filename=exit_point,
    )
    return pipe_graph_rescaling</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<!-- Sidebar navigation: a (currently empty) table of contents followed by the page index. -->
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<!-- Link up to the parent package page. -->
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.pipelines" href="index.html">lang_main.pipelines</a></code></li>
</ul>
</li>
<!-- One in-page anchor link per function entry in the "Functions" section. -->
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.pipelines.predefined.build_base_target_feature_pipe" href="#lang_main.pipelines.predefined.build_base_target_feature_pipe">build_base_target_feature_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_merge_duplicates_pipe" href="#lang_main.pipelines.predefined.build_merge_duplicates_pipe">build_merge_duplicates_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_timeline_pipe" href="#lang_main.pipelines.predefined.build_timeline_pipe">build_timeline_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_tk_graph_pipe" href="#lang_main.pipelines.predefined.build_tk_graph_pipe">build_tk_graph_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_tk_graph_post_pipe" href="#lang_main.pipelines.predefined.build_tk_graph_post_pipe">build_tk_graph_post_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_tk_graph_render_pipe" href="#lang_main.pipelines.predefined.build_tk_graph_render_pipe">build_tk_graph_render_pipe</a></code></li>
<li><code><a title="lang_main.pipelines.predefined.build_tk_graph_rescaling_pipe" href="#lang_main.pipelines.predefined.build_tk_graph_rescaling_pipe">build_tk_graph_rescaling_pipe</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>