Module lang_main.analysis.tokens

Functions

def add_doc_info_to_graph(graph: TokenGraph,
                          doc: spacy.tokens.doc.Doc,
                          weight: int | None) ‑> None
def add_doc_info_to_graph(
    graph: TokenGraph,
    doc: SpacyDoc,
    weight: int | None,
) -> None:
    # iterate over sentences
    for sent in doc.sents:
        # iterate over tokens in sentence
        for token in sent:
            # skip tokens which are not relevant
            if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                continue
            # skip tokens which are dates or times
            if token.pos_ == 'NUM' and is_str_date(string=token.text):
                continue

            relevant_descendants = obtain_relevant_descendants(token=token)
            # for non-AUX: add parent <--> descendant pair to graph
            if token.pos_ not in POS_INDIRECT:
                for descendant in relevant_descendants:
                    # add descendant and parent to graph
                    update_graph(
                        graph=graph,
                        parent=token.lemma_,
                        child=descendant.lemma_,
                        weight_connection=weight,
                    )
            else:
                # if indirect POS, make connection between all associated words
                combs = combinations(relevant_descendants, r=2)
                for comb in combs:
                    # !! there is no real parent/child relation in this case;
                    # !! only a single connection is made per pair
                    update_graph(
                        graph=graph,
                        parent=comb[0].lemma_,
                        child=comb[1].lemma_,
                        weight_connection=weight,
                    )
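For illustration, a minimal usage sketch (the example sentence and model name are hypothetical; assumes a German spaCy pipeline is installed and that TokenGraph accepts enable_logging as shown in build_token_graph below):

import spacy

nlp = spacy.load('de_core_news_md')
graph = TokenGraph(enable_logging=False)
doc = nlp('Die Pumpe der Anlage wurde gestern ausgetauscht.')
# extract relevant token pairs from the parsed document into the graph
add_doc_info_to_graph(graph=graph, doc=doc, weight=None)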
def build_token_graph(data: pandas.core.frame.DataFrame,
                      model: spacy.language.Language,
                      *,
                      target_feature: str = 'entry',
                      weights_feature: str | None = None,
                      batch_idx_feature: str | None = 'batched_idxs',
                      build_map: bool = True,
                      batch_size_model: int = 50,
                      logging_graph: bool = True) ‑> tuple[TokenGraph, dict[int | numpy.int64, spacy.tokens.doc.Doc] | None]
def build_token_graph(
    data: DataFrame,
    model: SpacyModel,
    *,
    target_feature: str = 'entry',
    weights_feature: str | None = None,
    batch_idx_feature: str | None = 'batched_idxs',
    build_map: bool = True,
    batch_size_model: int = 50,
    logging_graph: bool = True,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
    graph = TokenGraph(enable_logging=logging_graph)
    model_input = cast(tuple[str, ...], tuple(data[target_feature].to_list()))
    if weights_feature is not None:
        weights = cast(tuple[int, ...], tuple(data[weights_feature].to_list()))
    else:
        weights = None

    docs_mapping: dict[PandasIndex, SpacyDoc] | None
    if build_map and batch_idx_feature is None:
        raise ValueError('Cannot build mapping if batched indices are unknown.')
    elif build_map:
        indices = cast(tuple[list[PandasIndex], ...], tuple(data[batch_idx_feature].to_list()))
        docs_mapping = {}
    else:
        indices = None
        docs_mapping = None

    index: int = 0

    for doc in tqdm(
        model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
    ):
        weight: int | None = None
        if weights is not None:
            weight = weights[index]

        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
            weight=weight,
        )
        # build map if option chosen
        if indices is not None and docs_mapping is not None:
            corresponding_indices = indices[index]
            for idx in corresponding_indices:
                docs_mapping[idx] = doc

        index += 1

    # metadata
    graph.update_metadata()
    # convert to undirected
    graph.to_undirected(logging=False)
    graph.perform_static_analysis()

    return graph, docs_mapping
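A hedged end-to-end sketch (hypothetical data; assumes, as the mapping loop above suggests, that 'batched_idxs' holds per row the list of original DataFrame indices covered by that entry):

import pandas as pd
import spacy

nlp = spacy.load('de_core_news_md')
data = pd.DataFrame({
    'entry': ['Pumpe defekt, Dichtung erneuert.', 'Motor überhitzt.'],
    'batched_idxs': [[0], [1]],
})
graph, docs_mapping = build_token_graph(data, nlp, build_map=True)
# docs_mapping maps each original index to the spaCy Doc built for its entry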
def is_str_date(string: str, fuzzy: bool = False) ‑> bool
def is_str_date(
    string: str,
    fuzzy: bool = False,
) -> bool:
    """not stable function to test strings for dates, not 100 percent reliable

    Parameters
    ----------
    string : str
        string to check for dates
    fuzzy : bool, optional
        whether to use dateutil.parser.parse's fuzzy parsing capability, by default False

    Returns
    -------
    bool
        indicates whether date was found or not
    """
    try:
        # purely numeric strings are only date candidates if they consist of
        # two or four digits (a day/month or a year); otherwise reject them
        int(string)
        if len(string) not in {2, 4}:
            return False
    except ValueError:
        # not a number
        pass

    try:
        parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False)
        return True
    except ValueError:
        date_found: bool = False
        match = pattern_dates.search(string)
        if match is None:
            return date_found
        date_found = any(match.groups())
        return date_found

Heuristic check whether a string represents a date; not 100 percent reliable.

Parameters

string : str
string to check for dates
fuzzy : bool, optional
whether to use dateutil.parser.parse's fuzzy parsing capability, by default False

Returns

bool
indicates whether date was found or not
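Expected behaviour for a few inputs (a sketch; the fallback branch depends on the module-level regex pattern_dates, which is not shown here):

is_str_date('12.05.2023')  # True: parsed by dateutil with dayfirst=True
is_str_date('2023')        # True: four digits are treated as a year candidate
is_str_date('123456')      # False: purely numeric, but not two or four digits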
def obtain_relevant_descendants(token: spacy.tokens.token.Token) ‑> Iterator[spacy.tokens.token.Token]
def obtain_relevant_descendants(
    token: SpacyToken,
) -> Iterator[SpacyToken]:
    for descendant in token.subtree:
        # subtrees contain the token itself
        # if current element is token skip this element
        if descendant == token:
            continue

        # if descendant is a date, skip it
        if is_str_date(string=descendant.text):
            continue

        logger.debug(
            'Token >>%s<<, POS >>%s<< | descendant >>%s<<, POS >>%s<<',
            token,
            token.pos_,
            descendant,
            descendant.pos_,
        )

        # eliminate cases of cross-references with verbs
        if token.pos_ in {'AUX', 'VERB'} and descendant.pos_ in {'AUX', 'VERB'}:
            continue
        # skip cases in which descendant is indirect POS with others than verbs
        elif descendant.pos_ in POS_INDIRECT:
            continue
        # skip cases in which child has no relevant POS or TAG
        elif not (descendant.pos_ in POS_OF_INTEREST or descendant.tag_ in TAG_OF_INTEREST):
            continue

        yield descendant

        # TODO look at results and fine-tune function accordingly
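A small usage sketch (hypothetical sentence and model name; which descendants are actually yielded depends on the module-level POS_OF_INTEREST and TAG_OF_INTEREST constants):

import spacy

nlp = spacy.load('de_core_news_md')
doc = nlp('Die defekte Pumpe wurde ausgetauscht.')
for token in doc:
    if token.pos_ == 'VERB':
        for descendant in obtain_relevant_descendants(token=token):
            print(token.lemma_, '->', descendant.lemma_)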
def pre_clean_word(string: str) ‑> str
def pre_clean_word(string: str) -> str:
    # strip every character that is not an ASCII letter or a German umlaut
    # (note: 'ß' is not part of the character class and is removed as well)
    pattern = r'[^A-Za-zäöüÄÖÜ]+'
    string = re.sub(pattern, '', string)

    return string
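Example inputs and outputs, derived directly from the character class above:

pre_clean_word('Pumpe-5!')    # 'Pumpe'
pre_clean_word('überhitzt.')  # 'überhitzt' (umlauts preserved)
pre_clean_word('Straße')      # 'Strae' ('ß' is not in the character class)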