Module lang_main.analysis.tokens
Functions
def add_doc_info_to_graph(graph: TokenGraph,
                          doc: spacy.tokens.doc.Doc,
                          weight: int | None) -> None
    def add_doc_info_to_graph(
        graph: TokenGraph,
        doc: SpacyDoc,
        weight: int | None,
    ) -> None:
        # iterate over sentences
        for sent in doc.sents:
            # iterate over tokens in sentence
            for token in sent:
                # skip tokens which are not relevant
                if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                    continue
                # skip tokens which are dates or times
                if token.pos_ == 'NUM' and is_str_date(string=token.text):
                    continue
                relevant_descendants = obtain_relevant_descendants(token=token)
                # for non-AUX: add parent <--> descendant pair to graph
                if token.pos_ not in POS_INDIRECT:
                    for descendant in relevant_descendants:
                        # add descendant and parent to graph
                        update_graph(
                            graph=graph,
                            parent=token.lemma_,
                            child=descendant.lemma_,
                            weight_connection=weight,
                        )
                else:
                    # if indirect POS, make connection between all associated words
                    combs = combinations(relevant_descendants, r=2)
                    for comb in combs:
                        # !! parents and children do not really exist in this case,
                        # !! but only one connection is made
                        update_graph(
                            graph=graph,
                            parent=comb[0].lemma_,
                            child=comb[1].lemma_,
                            weight_connection=weight,
                        )
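Usage sketch (illustrative, not part of the module): build a graph from a single parsed document. The German pipeline name and the TokenGraph import path are assumptions.

    import spacy

    from lang_main.analysis.tokens import TokenGraph, add_doc_info_to_graph

    # 'de_core_news_sm' is an assumed model; any pipeline with a parser works
    nlp = spacy.load('de_core_news_sm')
    doc = nlp('Die defekte Pumpe wurde gestern ausgetauscht.')

    graph = TokenGraph(enable_logging=False)
    add_doc_info_to_graph(graph=graph, doc=doc, weight=1)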
def build_token_graph(data: pandas.core.frame.DataFrame,
                      model: spacy.language.Language,
                      *,
                      target_feature: str = 'entry',
                      weights_feature: str | None = None,
                      batch_idx_feature: str | None = 'batched_idxs',
                      build_map: bool = True,
                      batch_size_model: int = 50,
                      logging_graph: bool = True) -> tuple[TokenGraph, dict[int | numpy.int64, spacy.tokens.doc.Doc] | None]
    def build_token_graph(
        data: DataFrame,
        model: SpacyModel,
        *,
        target_feature: str = 'entry',
        weights_feature: str | None = None,
        batch_idx_feature: str | None = 'batched_idxs',
        build_map: bool = True,
        batch_size_model: int = 50,
        logging_graph: bool = True,
    ) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
        graph = TokenGraph(enable_logging=logging_graph)
        model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
        if weights_feature is not None:
            weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
        else:
            weights = None

        docs_mapping: dict[PandasIndex, SpacyDoc] | None
        if build_map and batch_idx_feature is None:
            raise ValueError('Can not build mapping if batched indices are unknown.')
        elif build_map:
            indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
            docs_mapping = {}
        else:
            indices = None
            docs_mapping = None

        index: int = 0
        for doc in tqdm(
            model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
        ):
            weight: int | None = None
            if weights is not None:
                weight = weights[index]
            add_doc_info_to_graph(
                graph=graph,
                doc=doc,
                weight=weight,
            )
            # build map if option chosen
            if indices is not None and docs_mapping is not None:
                corresponding_indices = indices[index]
                for idx in corresponding_indices:
                    docs_mapping[idx] = doc
            index += 1

        # metadata
        graph.update_metadata()
        # convert to undirected
        graph.to_undirected(logging=False)
        graph.perform_static_analysis()

        return graph, docs_mapping
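Typical call, sketched from the defaults above; the DataFrame content and the model name are illustrative assumptions.

    import pandas as pd
    import spacy

    from lang_main.analysis.tokens import build_token_graph

    nlp = spacy.load('de_core_news_sm')  # assumed German pipeline
    data = pd.DataFrame({
        'entry': ['Motor startet nicht.', 'Pumpe defekt, Lager getauscht.'],
        'batched_idxs': [[0], [1]],  # indices of the rows each entry covers
    })

    graph, docs_mapping = build_token_graph(
        data,
        nlp,
        target_feature='entry',
        batch_idx_feature='batched_idxs',
        build_map=True,
    )
    # graph: undirected TokenGraph over lemma connections
    # docs_mapping: {0: <Doc>, 1: <Doc>}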
def is_str_date(string: str, fuzzy: bool = False) -> bool
    def is_str_date(
        string: str,
        fuzzy: bool = False,
    ) -> bool:
        """Unstable heuristic to test strings for dates; not 100 percent reliable

        Parameters
        ----------
        string : str
            string to check for dates
        fuzzy : bool, optional
            whether to use dateutil.parser.parse's fuzzy capability, by default False

        Returns
        -------
        bool
            indicates whether a date was found or not
        """
        try:
            # check if string is a number:
            # purely numeric strings are only date candidates with 2 or 4 digits
            int(string)
            if len(string) not in {2, 4}:
                return False
        except ValueError:
            # not a number
            pass

        try:
            parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False)
            return True
        except ValueError:
            date_found: bool = False
            match = pattern_dates.search(string)
            if match is None:
                return date_found
            date_found = any(match.groups())
            return date_found

Unstable heuristic to test strings for dates; not 100 percent reliable.
Parameters
string : str
    string to check for dates
fuzzy : bool, optional
    whether to use dateutil.parser.parse's fuzzy capability, by default False
Returns
bool
    indicates whether a date was found or not
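Illustrative calls; the first two results follow directly from the code above, the last depends on the module-internal pattern_dates regex and is an expectation, not a guarantee.

    from lang_main.analysis.tokens import is_str_date

    is_str_date('12.05.2021')  # dateutil parses it (dayfirst=True) -> True
    is_str_date('123456')      # numeric, length not in {2, 4}      -> False
    is_str_date('Motor')       # unparseable, regex fallback        -> expected False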
def obtain_relevant_descendants(token: spacy.tokens.token.Token) -> Iterator[spacy.tokens.token.Token]
    def obtain_relevant_descendants(
        token: SpacyToken,
    ) -> Iterator[SpacyToken]:
        for descendant in token.subtree:
            # subtrees contain the token itself:
            # if the current element is the token, skip it
            if descendant == token:
                continue
            # if the descendant is a date, skip it
            if is_str_date(string=descendant.text):
                continue
            logger.debug(
                'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
                token,
                token.pos_,
                descendant,
                descendant.pos_,
            )
            # eliminate cases of cross-references with verbs
            if (token.pos_ == 'AUX' or token.pos_ == 'VERB') and (
                descendant.pos_ == 'AUX' or descendant.pos_ == 'VERB'
            ):
                continue
            # skip cases in which the descendant has an indirect POS
            # (pairs with verbs were handled above)
            elif descendant.pos_ in POS_INDIRECT:
                continue
            # skip cases in which the descendant has no relevant POS or TAG
            elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
                continue
            yield descendant
        # TODO look at results and fine-tune function accordingly
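Sketch: list each token together with its relevant descendants (pipeline name assumed).

    import spacy

    from lang_main.analysis.tokens import obtain_relevant_descendants

    nlp = spacy.load('de_core_news_sm')
    doc = nlp('Die defekte Pumpe wurde gestern ausgetauscht.')

    for token in doc:
        for descendant in obtain_relevant_descendants(token=token):
            print(f'{token.lemma_} -> {descendant.lemma_}')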
def pre_clean_word(string: str) -> str
    def pre_clean_word(string: str) -> str:
        # strip every character that is not a Latin letter or a German umlaut
        pattern = r'[^A-Za-zäöüÄÖÜ]+'
        string = re.sub(pattern, '', string)
        return string
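The behaviour follows directly from the regex: every character outside A-Za-z and the umlauts äöüÄÖÜ is removed. Note that ß is not in the character class and is stripped as well.

    pre_clean_word('Öl-Filter 2021!')  # -> 'ÖlFilter'
    pre_clean_word('Straße')           # -> 'Strae' (ß is stripped)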