improved dashboard, fixed language tags, tests graph plotting

This commit is contained in:
Florian Förster
2024-06-26 16:13:53 +02:00
parent fb4437a3a2
commit 2656780907
11 changed files with 541 additions and 1714 deletions

View File

@@ -1,4 +1,5 @@
import logging
import os
import shutil
import sys
from pathlib import Path
@@ -41,3 +42,9 @@ else:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
# append Graphviz binary folder to system path if not already contained
if sys.platform == 'win32':
    path = Path(r'C:\Program Files\Graphviz\bin')
    # PATH may be unset in stripped-down environments; treat it as empty
    # instead of raising a KeyError
    _current_path = os.environ.get('PATH', '')
    # compare complete PATH entries (case-insensitive, ignoring surrounding
    # quotes and trailing separators) rather than testing for a substring,
    # which would also match unrelated folders such as '...\Graphviz\bin2'
    _known_entries = {
        entry.strip().strip('"').rstrip('\\/').lower()
        for entry in _current_path.split(os.pathsep)
        if entry.strip()
    }
    if path.is_dir() and str(path).lower() not in _known_entries:
        # os.pathsep is ';' on Windows; skipping an empty PATH avoids a
        # leading separator
        os.environ['PATH'] = os.pathsep.join(
            part for part in (_current_path, str(path)) if part
        )

View File

@@ -164,8 +164,11 @@ def convert_graph_to_cytoscape(
}
cyto_data.append(edge_data)
min_weight = min(weights)
max_weight = max(weights)
min_weight: int = 0
max_weight: int = 0
if weights:
min_weight = min(weights)
max_weight = max(weights)
weight_metadata: WeightData = {'min': min_weight, 'max': max_weight}
return cyto_data, weight_metadata
@@ -284,6 +287,9 @@ class TokenGraph(DiGraph):
f'{len(self.edges)})'
)
def disable_logging(self) -> None:
    """Turn logging off for this instance.

    Sets the ``logging`` attribute of the instance to ``False``. Nothing
    is returned.
    """
    self.logging = False
# !! only used to verify that saving was done correctly
"""
def __key(self) -> tuple[Hashable, ...]:
@@ -337,16 +343,9 @@ class TokenGraph(DiGraph):
logging: bool | None = ...,
) -> Graph: ...
@overload
def to_undirected(
self,
inplace: bool = ...,
logging: bool | None = ...,
) -> Graph | None: ...
def to_undirected(
self,
inplace=True,
inplace: bool = True,
logging: bool | None = None,
) -> Graph | None:
if logging is None:

View File

@@ -21,7 +21,8 @@ from lang_main.types import PandasIndex
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
@@ -153,6 +154,7 @@ def build_token_graph(
batch_idx_feature: str = ...,
build_map: Literal[False],
batch_size_model: int = ...,
logging_graph: bool = ...,
) -> tuple[TokenGraph, None]: ...
@@ -166,6 +168,7 @@ def build_token_graph(
batch_idx_feature: str = ...,
build_map: Literal[True] = ...,
batch_size_model: int = ...,
logging_graph: bool = ...,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: ...
@@ -175,11 +178,12 @@ def build_token_graph(
*,
target_feature: str = 'entry',
weights_feature: str | None = None,
batch_idx_feature: str = 'batched_idxs',
batch_idx_feature: str | None = 'batched_idxs',
build_map: bool = True,
batch_size_model: int = 50,
logging_graph: bool = True,
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
graph = TokenGraph()
graph = TokenGraph(enable_logging=logging_graph)
model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
if weights_feature is not None:
weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
@@ -187,7 +191,9 @@ def build_token_graph(
weights = None
docs_mapping: dict[PandasIndex, SpacyDoc] | None
if build_map:
if build_map and batch_idx_feature is None:
raise ValueError('Can not build mapping if batched indices are unknown.')
elif build_map:
indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
docs_mapping = {}
else:
@@ -199,10 +205,10 @@ def build_token_graph(
for doc in tqdm(
model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
):
weight: int | None = None
if weights is not None:
weight = weights[index]
else:
weight = None
add_doc_info_to_graph(
graph=graph,
doc=doc,
@@ -219,7 +225,7 @@ def build_token_graph(
# metadata
graph.update_metadata()
# convert to undirected
graph.to_undirected()
graph.to_undirected(logging=False)
return graph, docs_mapping
@@ -250,7 +256,7 @@ def build_token_graph_simple(
# metadata
graph.update_metadata()
# convert to undirected
graph.to_undirected()
graph.to_undirected(logging=False)
return graph, docs_mapping

View File

@@ -1,4 +1,5 @@
import pickle
import base64
import shutil
import tomllib
from pathlib import Path
@@ -61,6 +62,24 @@ def load_pickle(
return obj
def encode_to_base64_str(
    obj: Any,
    encoding: str = 'utf-8',
) -> str:
    """Serialise an object with pickle and return it as a Base64 string.

    The object is pickled with the highest available pickle protocol, the
    resulting bytes are Base64-encoded and finally decoded into a ``str``
    using ``encoding``.

    Parameters
    ----------
    obj : Any
        object to serialise, must be picklable
    encoding : str, optional
        codec used to turn the Base64 bytes into a string, by default 'utf-8'

    Returns
    -------
    str
        Base64 representation of the pickled object
    """
    pickled = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return str(base64.b64encode(pickled), encoding)
def decode_from_base64_str(
    b64_str: str,
    encoding: str = 'utf-8',
) -> Any:
    """Restore an object from a Base64 string containing pickled data.

    The string is encoded to bytes with ``encoding``, Base64-decoded and
    the resulting bytes are unpickled.

    NOTE(review): unpickling can execute arbitrary code, so only strings
    produced by trusted code should be passed in -- confirm that callers
    never forward user-controlled data.

    Parameters
    ----------
    b64_str : str
        Base64 string which holds a pickled object
    encoding : str, optional
        codec used to turn the string into bytes, by default 'utf-8'

    Returns
    -------
    Any
        the unpickled object
    """
    raw_pickle = base64.b64decode(b64_str.encode(encoding))
    return pickle.loads(raw_pickle)
def get_entry_point(
saving_path: Path,
filename: str,

View File

@@ -97,6 +97,18 @@ class BasePipeline(ABC):
class PipelineContainer(BasePipeline):
"""Container class for basic actions.
Basic actions are usually functions, which do not take any parameters
and return nothing. Indeed, if an action returns any values after its
procedure is finished, an error is raised. Therefore, PipelineContainers
can be seen as a concatenation of many (independent) simple procedures
which are executed in the order in which they were added to the pipe.
With a simple call of the ``run`` method the actions are performed.
Additionally, there is an option to skip actions which can be set in
the ``add`` method. This allows for easily configurable pipelines,
e.g., via a user configuration.
"""
def __init__(
self,
name: str,
@@ -169,9 +181,6 @@ class Pipeline(BasePipeline):
f'working dir: {self.working_dir}, contents: {self.action_names})'
)
# @property
# def intermediate_result(self) -> tuple[Any, ...] | None:
# return self._intermediate_result
@override
def add(
self,