improved dashboard, fixed language tags, tests graph plotting
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -41,3 +42,9 @@ else:
|
||||
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
|
||||
|
||||
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
|
||||
|
||||
# Append the default Graphviz binary folder to the process PATH on Windows,
# but only when the folder exists and is not already one of the PATH entries.
if sys.platform == 'win32':
    path = Path(r'C:\Program Files\Graphviz\bin')
    # PATH may be unset in stripped-down environments; treat that as empty
    # instead of raising KeyError.
    current_path = os.environ.get('PATH', '')
    # Compare whole entries (case-insensitively, ignoring quotes and trailing
    # separators) rather than substrings, so that a different folder such as
    # '...\Graphviz\bin2' is not mistaken for the target folder.
    known_entries = {
        entry.strip().strip('"').rstrip('\\/').lower()
        for entry in current_path.split(os.pathsep)
        if entry.strip()
    }
    if path.is_dir() and str(path).lower() not in known_entries:
        os.environ['PATH'] = (
            f'{current_path}{os.pathsep}{path}' if current_path else str(path)
        )
|
||||
|
||||
@@ -164,8 +164,11 @@ def convert_graph_to_cytoscape(
|
||||
}
|
||||
cyto_data.append(edge_data)
|
||||
|
||||
min_weight = min(weights)
|
||||
max_weight = max(weights)
|
||||
min_weight: int = 0
|
||||
max_weight: int = 0
|
||||
if weights:
|
||||
min_weight = min(weights)
|
||||
max_weight = max(weights)
|
||||
weight_metadata: WeightData = {'min': min_weight, 'max': max_weight}
|
||||
|
||||
return cyto_data, weight_metadata
|
||||
@@ -284,6 +287,9 @@ class TokenGraph(DiGraph):
|
||||
f'{len(self.edges)})'
|
||||
)
|
||||
|
||||
def disable_logging(self) -> None:
    """Set this instance's ``logging`` attribute to ``False``.

    NOTE(review): presumably this flag is what methods fall back to when
    their own ``logging`` argument is ``None`` -- confirm against the rest
    of the class.
    """
    self.logging = False
|
||||
|
||||
# !! only used to verify that saving was done correctly
|
||||
"""
|
||||
def __key(self) -> tuple[Hashable, ...]:
|
||||
@@ -337,16 +343,9 @@ class TokenGraph(DiGraph):
|
||||
logging: bool | None = ...,
|
||||
) -> Graph: ...
|
||||
|
||||
@overload
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace: bool = ...,
|
||||
logging: bool | None = ...,
|
||||
) -> Graph | None: ...
|
||||
|
||||
def to_undirected(
|
||||
self,
|
||||
inplace=True,
|
||||
inplace: bool = True,
|
||||
logging: bool | None = None,
|
||||
) -> Graph | None:
|
||||
if logging is None:
|
||||
|
||||
@@ -21,7 +21,8 @@ from lang_main.types import PandasIndex
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
|
||||
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
|
||||
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
|
||||
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
|
||||
|
||||
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
|
||||
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
|
||||
@@ -153,6 +154,7 @@ def build_token_graph(
|
||||
batch_idx_feature: str = ...,
|
||||
build_map: Literal[False],
|
||||
batch_size_model: int = ...,
|
||||
logging_graph: bool = ...,
|
||||
) -> tuple[TokenGraph, None]: ...
|
||||
|
||||
|
||||
@@ -166,6 +168,7 @@ def build_token_graph(
|
||||
batch_idx_feature: str = ...,
|
||||
build_map: Literal[True] = ...,
|
||||
batch_size_model: int = ...,
|
||||
logging_graph: bool = ...,
|
||||
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]: ...
|
||||
|
||||
|
||||
@@ -175,11 +178,12 @@ def build_token_graph(
|
||||
*,
|
||||
target_feature: str = 'entry',
|
||||
weights_feature: str | None = None,
|
||||
batch_idx_feature: str = 'batched_idxs',
|
||||
batch_idx_feature: str | None = 'batched_idxs',
|
||||
build_map: bool = True,
|
||||
batch_size_model: int = 50,
|
||||
logging_graph: bool = True,
|
||||
) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None]:
|
||||
graph = TokenGraph()
|
||||
graph = TokenGraph(enable_logging=logging_graph)
|
||||
model_input = cast(tuple[str], tuple(data[target_feature].to_list()))
|
||||
if weights_feature is not None:
|
||||
weights = cast(tuple[int], tuple(data[weights_feature].to_list()))
|
||||
@@ -187,7 +191,9 @@ def build_token_graph(
|
||||
weights = None
|
||||
|
||||
docs_mapping: dict[PandasIndex, SpacyDoc] | None
|
||||
if build_map:
|
||||
if build_map and batch_idx_feature is None:
|
||||
raise ValueError('Can not build mapping if batched indices are unknown.')
|
||||
elif build_map:
|
||||
indices = cast(tuple[list[PandasIndex]], tuple(data[batch_idx_feature].to_list()))
|
||||
docs_mapping = {}
|
||||
else:
|
||||
@@ -199,10 +205,10 @@ def build_token_graph(
|
||||
for doc in tqdm(
|
||||
model.pipe(model_input, batch_size=batch_size_model), total=len(model_input)
|
||||
):
|
||||
weight: int | None = None
|
||||
if weights is not None:
|
||||
weight = weights[index]
|
||||
else:
|
||||
weight = None
|
||||
|
||||
add_doc_info_to_graph(
|
||||
graph=graph,
|
||||
doc=doc,
|
||||
@@ -219,7 +225,7 @@ def build_token_graph(
|
||||
# metadata
|
||||
graph.update_metadata()
|
||||
# convert to undirected
|
||||
graph.to_undirected()
|
||||
graph.to_undirected(logging=False)
|
||||
|
||||
return graph, docs_mapping
|
||||
|
||||
@@ -250,7 +256,7 @@ def build_token_graph_simple(
|
||||
# metadata
|
||||
graph.update_metadata()
|
||||
# convert to undirected
|
||||
graph.to_undirected()
|
||||
graph.to_undirected(logging=False)
|
||||
|
||||
return graph, docs_mapping
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import pickle
|
||||
import base64
|
||||
import shutil
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
@@ -61,6 +62,24 @@ def load_pickle(
|
||||
return obj
|
||||
|
||||
|
||||
def encode_to_base64_str(
    obj: Any,
    encoding: str = 'utf-8',
) -> str:
    """Pickle ``obj`` and return the pickle payload as Base64 text.

    Parameters
    ----------
    obj : Any
        Object to serialise; it must be picklable.
    encoding : str, optional
        Codec used to turn the Base64 bytes into ``str``, by default 'utf-8'.

    Returns
    -------
    str
        Base64 representation of the pickled object.
    """
    # Pickle with the highest protocol supported by the running interpreter.
    serialised = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    b64_bytes = base64.b64encode(serialised)
    return b64_bytes.decode(encoding=encoding)
|
||||
|
||||
|
||||
def decode_from_base64_str(
    b64_str: str,
    encoding: str = 'utf-8',
) -> Any:
    """Decode Base64 text and unpickle the resulting bytes.

    Parameters
    ----------
    b64_str : str
        Base64 text wrapping a pickle payload.
    encoding : str, optional
        Codec used to turn ``b64_str`` into bytes before Base64 decoding,
        by default 'utf-8'.

    Returns
    -------
    Any
        The object reconstructed by ``pickle.loads``.
    """
    b64_bytes = b64_str.encode(encoding=encoding)
    decoded = base64.b64decode(b64_bytes)
    # SECURITY: unpickling can execute arbitrary code. Only pass strings that
    # originate from a trusted source; never feed unvalidated external input
    # into this function.
    return pickle.loads(decoded)
|
||||
|
||||
|
||||
def get_entry_point(
|
||||
saving_path: Path,
|
||||
filename: str,
|
||||
|
||||
@@ -97,6 +97,18 @@ class BasePipeline(ABC):
|
||||
|
||||
|
||||
class PipelineContainer(BasePipeline):
|
||||
"""Container class for basic actions.
|
||||
Basic actions are usually functions, which do not take any parameters
|
||||
and return nothing. Indeed, if an action returns any values after its
|
||||
procedure is finished, an error is raised. Therefore, PipelineContainers
|
||||
can be seen as a concatenation of many (independent) simple procedures
|
||||
which are executed in the order in which they were added to the pipe.
|
||||
With a simple call of the ``run`` method the actions are performed.
|
||||
Additionally, there is an option to skip actions which can be set in
|
||||
the ``add`` method. This allows for easily configurable pipelines,
|
||||
e.g., via a user configuration.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
@@ -169,9 +181,6 @@ class Pipeline(BasePipeline):
|
||||
f'working dir: {self.working_dir}, contents: {self.action_names})'
|
||||
)
|
||||
|
||||
# @property
|
||||
# def intermediate_result(self) -> tuple[Any, ...] | None:
|
||||
# return self._intermediate_result
|
||||
@override
|
||||
def add(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user