improved dashboard, fixed language tags, tests graph plotting

2024-06-26 16:13:53 +02:00
parent fb4437a3a2
commit 2656780907
11 changed files with 541 additions and 1714 deletions
--- a/src/lang_main/init.py
+++ b/src/lang_main/init.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import shutil
 import sys
 from pathlib import Path
@@ -41,3 +42,9 @@ else:
    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)

 CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
+
+# Append the default Graphviz binary folder to the PATH environment variable
+# if it is not already contained. This only runs on Windows ('win32') and only
+# if the folder actually exists.
+if sys.platform == 'win32':
+    # hard-coded default folder; no other install location is checked here
+    path = Path(r'C:\Program Files\Graphviz\bin')
+    # NOTE(review): the containment check is a case-insensitive substring test
+    # against the whole PATH string, not a comparison of individual entries;
+    # os.environ['PATH'] raises KeyError if PATH is not set at all
+    if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
+        # ';' is written literally as the entry separator (cf. os.pathsep)
+        os.environ['PATH'] += f';{path}'
--- a/src/lang_main/analysis/graphs.py
+++ b/src/lang_main/analysis/graphs.py
@@ -164,8 +164,11 @@ def convert_graph_to_cytoscape(
        }
        cyto_data.append(edge_data)

-    min_weight = min(weights)
-    max_weight = max(weights)
+    min_weight: int = 0
+    max_weight: int = 0
+    if weights:
+        min_weight = min(weights)
+        max_weight = max(weights)
    weight_metadata: WeightData = {'min': min_weight, 'max': max_weight}

    return cyto_data, weight_metadata
@@ -284,6 +287,9 @@ class TokenGraph(DiGraph):
            f'{len(self.edges)})'
        )

+    def disable_logging(self) -> None:
+        """Set the ``logging`` attribute of this graph to ``False``."""
+        self.logging = False
+
    # !! only used to verify that saving was done correctly
    """
    def __key(self) -> tuple[Hashable, ...]:
@@ -337,16 +343,9 @@ class TokenGraph(DiGraph):
        logging: bool | None = ...,
    ) -> Graph: ...

-    @overload
    def to_undirected(
        self,
-        inplace: bool = ...,
-        logging: bool | None = ...,
-    ) -> Graph | None: ...
-
-    def to_undirected(
-        self,
-        inplace=True,
+        inplace: bool = True,
        logging: bool | None = None,
    ) -> Graph | None:
        if logging is None:
--- a/src/lang_main/analysis/tokens.py
+++ b/src/lang_main/analysis/tokens.py
@@ -21,7 +21,8 @@ from lang_main.types import PandasIndex
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
-POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
+POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])

 # POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
 POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
@@ -153,6 +154,7 @@ def build_token_graph(
    batch_idx_feature: str = ...,
    build_map: Literal[False],
    batch_size_model: int = ...,
+    logging_graph: bool = ...,
 ) -> tuple[TokenGraph, None]: ...


@@ -166,6 +168,7 @@ def build_token_graph(
    batch_idx_feature: str = ...,
    build_map: Literal[True] = ...,
    batch_size_model: int = ...,
+    logging_graph: bool = ...,
 ) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: ...


@@ -175,11 +178,12 @@ def build_token_graph(
    *,
    target_feature: str = 'entry',
    weights_feature: str | None = None,
-    batch_idx_feature: str = 'batched_idxs',
+    batch_idx_feature: str | None = 'batched_idxs',
    build_map: bool = True,
    batch_size_model: int = 50,
+    logging_graph: bool = True,
 ) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
-    graph = TokenGraph()
+    graph = TokenGraph(enable_logging=logging_graph)
    model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
    if weights_feature is not None:
        weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
@@ -187,7 +191,9 @@ def build_token_graph(
        weights = None

    docs_mapping: dict[PandasIndex, SpacyDoc] | None
-    if build_map:
+    if build_map and batch_idx_feature is None:
+        raise ValueError('Can not build mapping if batched indices are unknown.')
+    elif build_map:
        indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
        docs_mapping = {}
    else:
@@ -199,10 +205,10 @@ def build_token_graph(
    for doc in tqdm(
        model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
    ):
+        weight: int | None = None
        if weights is not None:
            weight = weights[index]
-        else:
-            weight = None
+
        add_doc_info_to_graph(
            graph=graph,
            doc=doc,
@@ -219,7 +225,7 @@ def build_token_graph(
    # metadata
    graph.update_metadata()
    # convert to undirected
-    graph.to_undirected()
+    graph.to_undirected(logging=False)

    return graph, docs_mapping

@@ -250,7 +256,7 @@ def build_token_graph_simple(
    # metadata
    graph.update_metadata()
    # convert to undirected
-    graph.to_undirected()
+    graph.to_undirected(logging=False)

    return graph, docs_mapping

--- a/src/lang_main/io.py
+++ b/src/lang_main/io.py
@@ -1,4 +1,5 @@
 import pickle
+import base64
 import shutil
 import tomllib
 from pathlib import Path
@@ -61,6 +62,24 @@ def load_pickle(
    return obj


+def encode_to_base64_str(
+    obj: Any,
+    encoding: str = 'utf-8',
+) -> str:
+    """Serialise an object with pickle and return it as a Base64 string.
+
+    The object is pickled with ``pickle.HIGHEST_PROTOCOL``, the resulting
+    bytes are Base64-encoded and then decoded into a ``str`` using
+    ``encoding``.
+
+    obj: object to serialise; it must be picklable
+    encoding: codec used to turn the Base64 bytes into a string
+
+    Returns the Base64 text representation of the pickled object.
+    """
+    serialised = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+    b64_bytes = base64.b64encode(serialised)
+    return b64_bytes.decode(encoding=encoding)
+
+
+def decode_from_base64_str(
+    b64_str: str,
+    encoding: str = 'utf-8',
+) -> Any:
+    """Restore an object from a Base64 string containing pickled data.
+
+    The string is encoded to bytes using ``encoding``, Base64-decoded and
+    the result is passed to ``pickle.loads``.
+
+    b64_str: Base64 text holding a pickled object
+    encoding: codec used to turn the string into Base64 bytes
+
+    Returns the unpickled object.
+
+    NOTE(review): unpickling can execute arbitrary code, so this must only
+    be called with strings from a trusted source -- confirm that callers
+    never pass externally supplied data.
+    """
+    b64_bytes = b64_str.encode(encoding=encoding)
+    decoded = base64.b64decode(b64_bytes)
+    return pickle.loads(decoded)
+
+
 def get_entry_point(
    saving_path: Path,
    filename: str,
--- a/src/lang_main/pipelines/base.py
+++ b/src/lang_main/pipelines/base.py
@@ -97,6 +97,18 @@ class BasePipeline(ABC):


 class PipelineContainer(BasePipeline):
+    """Container class for basic actions.
+    Basic actions are usually functions, which do not take any parameters
+    and return nothing. Indeed, if an action returns any values after its
+    procedure is finished, an error is raised. Therefore, PipelineContainers
+    can be seen as a concatenation of many (independent) simple procedures
+    which are executed in the order in which they were added to the pipe.
+    With a simple call of the ``run`` method the actions are performed.
+    Additionally, there is an option to skip actions which can be set in
+    the ``add`` method. This allows for easily configurable pipelines,
+    e.g., via a user configuration.
+    """
+
    def __init__(
        self,
        name: str,
@@ -169,9 +181,6 @@ class Pipeline(BasePipeline):
            f'working dir: {self.working_dir}, contents: {self.action_names})'
        )

-    # @property
-    # def intermediate_result(self) -> tuple[Any, ...] | None:
-    #     return self._intermediate_result
    @override
    def add(
        self,