2025-01-22 16:54:15 +01:00

321 lines
16 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<meta name="generator" content="pdoc3 0.11.5">
<title>lang_main.analysis.tokens API documentation</title>
<meta name="description" content="">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/sanitize.min.css" integrity="sha512-y1dtMcuvtTMJc1yPgEqF0ZjQbhnc/bFhyvIyVNb9Zk5mIGtqVaAB1Ttl28su8AvFMOY0EwRbAe+HCLqj6W7/KA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/13.0.0/typography.min.css" integrity="sha512-Y1DYSb995BAfxobCkKepB1BqJJTPrOp3zPL74AWFugHHmmdcvO+C48WLrUOlhGMc0QG7AE3f7gmvvcrmX2fDoA==" crossorigin>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:1.5em;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:2em 0 .50em 0}h3{font-size:1.4em;margin:1.6em 0 .7em 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .2s ease-in-out}a:visited{color:#503}a:hover{color:#b62}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900;font-weight:bold}pre code{font-size:.8em;line-height:1.4em;padding:1em;display:block}code{background:#f3f3f3;font-family:"DejaVu Sans Mono",monospace;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target 
.name{background:var(--highlight-color)}.name > span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source > summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible;min-width:max-content}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em 1em;margin:1em 0}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul ul{padding-left:1em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" integrity="sha512-D9gUyxqja7hBtkWpPWGt9wfbfaMGVt9gnyCvYa+jojwwPHLCzUm5i8rpk7vD7wNee9bA35eYIjobYPaQuKS1MQ==" crossorigin></script>
<script>
// Highlight every code listing once the DOM is parsed, then fold long Python
// docstrings into collapsible "details" elements so source views stay compact.
window.addEventListener('DOMContentLoaded', () => {
  // highlight.js is loaded from a CDN by a deferred script tag; if that request
  // failed, keep the plain listings instead of throwing a ReferenceError.
  if (typeof hljs === 'undefined') return;
  hljs.configure({languages: ['bash', 'css', 'diff', 'graphql', 'ini', 'javascript', 'json', 'plaintext', 'python', 'python-repl', 'rust', 'shell', 'sql', 'typescript', 'xml', 'yaml']});
  hljs.highlightAll();
  /* Collapse source docstrings */
  // Deferred by 100 ms (as before) so the highlighted markup exists before it is queried.
  setTimeout(() => {
    const tripleQuotes = ['"""', "'''"];
    document.querySelectorAll('.hljs.language-python > .hljs-string').forEach(el => {
      const html = el.innerHTML;
      const opener = html.substring(0, 3);
      // Only long, triple-quoted strings are treated as docstrings worth folding.
      if (html.length <= 200 || !tripleQuotes.includes(opener)) return;
      const d = document.createElement('details');
      d.classList.add('hljs-string');
      // Show the quote style the docstring actually uses (previously always """).
      d.innerHTML = '<summary>' + opener + '</summary>' + html.substring(3);
      el.replaceWith(d);
    });
  }, 100);
});
</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>lang_main.analysis.tokens</code></h1>
</header>
<section id="section-intro">
</section>
<section>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="lang_main.analysis.tokens.add_doc_info_to_graph"><code class="name flex">
<span>def <span class="ident">add_doc_info_to_graph</span></span>(<span>graph: <a title="lang_main.analysis.graphs.TokenGraph" href="graphs.html#lang_main.analysis.graphs.TokenGraph">TokenGraph</a>,<br>doc: spacy.tokens.doc.Doc,<br>weight: int | None) -&gt; None</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int | None,
) -&gt; None:
    # iterate over sentences
    for sent in doc.sents:
        # iterate over tokens in sentence
        for token in sent:
            # skip tokens which are not relevant
            if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                continue
            # skip token which are dates or times
            if token.pos_ == &#39;NUM&#39; and is_str_date(string=token.text):
                continue
            relevant_descendants = obtain_relevant_descendants(token=token)
            # for non-AUX: add parent &lt;--&gt; descendant pair to graph
            if token.pos_ not in POS_INDIRECT:
                for descendant in relevant_descendants:
                    # add descendant and parent to graph
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # if indirect POS, make connection between all associated words
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! parents and children do not really exist in this case,
                    # !! but only one connection is made
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.build_token_graph"><code class="name flex">
<span>def <span class="ident">build_token_graph</span></span>(<span>data: pandas.core.frame.DataFrame,<br>model: spacy.language.Language,<br>*,<br>target_feature: str = 'entry',<br>weights_feature: str | None = None,<br>batch_idx_feature: str | None = 'batched_idxs',<br>build_map: bool = True,<br>batch_size_model: int = 50,<br>logging_graph: bool = True) -&gt; tuple[<a title="lang_main.analysis.graphs.TokenGraph" href="graphs.html#lang_main.analysis.graphs.TokenGraph">TokenGraph</a>, dict[int | numpy.int64, spacy.tokens.doc.Doc] | None]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def build_token_graph(
    data: DataFrame,
    model: SpacyModel,
    *,
    target_feature: str = &#39;entry&#39;,
    weights_feature: str | None = None,
    batch_idx_feature: str | None = &#39;batched_idxs&#39;,
    build_map: bool = True,
    batch_size_model: int = 50,
    logging_graph: bool = True,
) -&gt; tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
    graph = TokenGraph(enable_logging=logging_graph)
    model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
    if weights_feature is not None:
        weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
    else:
        weights = None
    docs_mapping: dict[PandasIndex, SpacyDoc] | None
    if build_map and batch_idx_feature is None:
        raise ValueError(&#39;Can not build mapping if batched indices are unknown.&#39;)
    elif build_map:
        indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
        docs_mapping = {}
    else:
        indices = None
        docs_mapping = None
    index: int = 0
    for doc in tqdm(
        model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
    ):
        weight: int | None = None
        if weights is not None:
            weight = weights[index]
        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )
        # build map if option chosen
        if indices is not None and docs_mapping is not None:
            corresponding_indices = indices[index]
            for idx in corresponding_indices:
                docs_mapping[idx] = doc
        index += 1
    # metadata
    graph.update_metadata()
    # convert to undirected
    graph.to_undirected(logging=False)
    graph.perform_static_analysis()
    return graph, docs_mapping</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.is_str_date"><code class="name flex">
<span>def <span class="ident">is_str_date</span></span>(<span>string: str, fuzzy: bool = False) -&gt; bool</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def is_str_date(
    string: str,
    fuzzy: bool = False,
) -&gt; bool:
    &#34;&#34;&#34;not stable function to test strings for dates, not 100 percent reliable

    Parameters
    ----------
    string : str
        string to check for dates
    fuzzy : bool, optional
        whether to use the fuzzy capability of dateutil.parser.parse, by default False

    Returns
    -------
    bool
        indicates whether date was found or not
    &#34;&#34;&#34;
    try:
        # check if string is a number
        # a purely numeric string is only considered a possible date
        # if it has exactly 2 or 4 digits
        int(string)
        if len(string) not in {2, 4}:
            return False
    except ValueError:
        # not a number
        pass
    try:
        parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False)
        return True
    except ValueError:
        date_found: bool = False
        match = pattern_dates.search(string)
        if match is None:
            return date_found
        date_found = any(match.groups())
    return date_found</code></pre>
</details>
<div class="desc"><p>not stable function to test strings for dates, not 100 percent reliable</p>
<h2 id="parameters">Parameters</h2>
<dl>
<dt><strong><code>string</code></strong> :&ensp;<code>str</code></dt>
<dd>string to check for dates</dd>
<dt><strong><code>fuzzy</code></strong> :&ensp;<code>bool</code>, optional</dt>
<dd>whether to use the fuzzy capability of dateutil.parser.parse, by default False</dd>
</dl>
<h2 id="returns">Returns</h2>
<dl>
<dt><code>bool</code></dt>
<dd>indicates whether date was found or not</dd>
</dl></div>
</dd>
<dt id="lang_main.analysis.tokens.obtain_relevant_descendants"><code class="name flex">
<span>def <span class="ident">obtain_relevant_descendants</span></span>(<span>token: spacy.tokens.token.Token) -&gt; Iterator[spacy.tokens.token.Token]</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def obtain_relevant_descendants(
    token: SpacyToken,
) -&gt; Iterator[SpacyToken]:
    for descendant in token.subtree:
        # subtrees contain the token itself
        # if current element is token skip this element
        if descendant == token:
            continue
        # if descendant is a date skip it
        if is_str_date(string=descendant.text):
            continue
        logger.debug(
            &#39;Token &gt;&gt;%s&lt;&lt;, POS &gt;&gt;%s&lt;&lt; | descendant &gt;&gt;%s&lt;&lt;, POS &gt;&gt;%s&lt;&lt;&#39;,
            token,
            token.pos_,
            descendant,
            descendant.pos_,
        )
        # eliminate cases of cross-references with verbs
        if (token.pos_ == &#39;AUX&#39; or token.pos_ == &#39;VERB&#39;) and (
            descendant.pos_ == &#39;AUX&#39; or descendant.pos_ == &#39;VERB&#39;
        ):
            continue
        # skip cases in which descendant is indirect POS with others than verbs
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # skip cases in which child has no relevant POS or TAG
        elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
            continue
        yield descendant
    # TODO look at results and fine-tune function accordingly</code></pre>
</details>
<div class="desc"></div>
</dd>
<dt id="lang_main.analysis.tokens.pre_clean_word"><code class="name flex">
<span>def <span class="ident">pre_clean_word</span></span>(<span>string: str) -&gt; str</span>
</code></dt>
<dd>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def pre_clean_word(string: str) -&gt; str:
    pattern = r&#39;[^A-Za-zäöüÄÖÜ]+&#39;
    string = re.sub(pattern, &#39;&#39;, string)
    return string</code></pre>
</details>
<div class="desc"></div>
</dd>
</dl>
</section>
<section>
</section>
</article>
<nav id="sidebar">
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3>Super-module</h3>
<ul>
<li><code><a title="lang_main.analysis" href="index.html">lang_main.analysis</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="lang_main.analysis.tokens.add_doc_info_to_graph" href="#lang_main.analysis.tokens.add_doc_info_to_graph">add_doc_info_to_graph</a></code></li>
<li><code><a title="lang_main.analysis.tokens.build_token_graph" href="#lang_main.analysis.tokens.build_token_graph">build_token_graph</a></code></li>
<li><code><a title="lang_main.analysis.tokens.is_str_date" href="#lang_main.analysis.tokens.is_str_date">is_str_date</a></code></li>
<li><code><a title="lang_main.analysis.tokens.obtain_relevant_descendants" href="#lang_main.analysis.tokens.obtain_relevant_descendants">obtain_relevant_descendants</a></code></li>
<li><code><a title="lang_main.analysis.tokens.pre_clean_word" href="#lang_main.analysis.tokens.pre_clean_word">pre_clean_word</a></code></li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.11.5</a>.</p>
</footer>
</body>
</html>