added new graph metrics

2024-12-19 16:26:01 +01:00
parent 123869e203
commit 80a35c4658
24 changed files with 826 additions and 97 deletions
--- a/src/lang_main/analysis/graphs.py
+++ b/src/lang_main/analysis/graphs.py
@@ -16,12 +16,15 @@ from pandas import DataFrame
 from lang_main.constants import (
    EDGE_WEIGHT_DECIMALS,
    LOGGING_DEFAULT_GRAPHS,
+    PROPERTY_NAME_BETWEENNESS_CENTRALITY,
    PROPERTY_NAME_DEGREE_WEIGHTED,
+    PROPERTY_NAME_IMPORTANCE,
 )
 from lang_main.errors import (
    EdgePropertyNotContainedError,
    EmptyEdgesError,
    EmptyGraphError,
+    NodePropertyNotContainedError,
 )
 from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_graphs as logger
@@ -310,15 +313,98 @@ def add_weighted_degree(
        property of the edges which contains the weight information, by default 'weight'
    property_name : str, optional
        target name for the property containing the weighted degree in nodes,
-        by default 'degree_weighted'
+        by default PROPERTY_NAME_DEGREE_WEIGHTED
    """
-    node_degree_mapping = cast(
+    node_property_mapping = cast(
        dict[str, float],
        dict(graph.degree(weight=edge_weight_property)),  # type: ignore
    )
    nx.set_node_attributes(
        graph,
-        node_degree_mapping,
+        node_property_mapping,
+        name=property_name,
+    )
+
+
+def add_betweenness_centrality(
+    graph: DiGraph | Graph,
+    edge_weight_property: str | None = None,
+    property_name: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
+) -> None:
+    """Adds the betweenness centrality as property to each node of the given graph.
+    Operation is performed inplace.
+
+    Parameters
+    ----------
+    graph : DiGraph | Graph
+        Graph with betweenness centrality as node property added inplace
+    edge_weight_property : str | None, optional
+        property of the edges which contains the weight information,
+        not necessarily needed, by default None
+    property_name : str, optional
+        target name for the property containing the betweenness centrality in nodes,
+        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY
+    """
+
+    node_property_mapping = cast(
+        dict[str, float],
+        nx.betweenness_centrality(graph, normalized=True, weight=edge_weight_property),  # type: ignore
+    )
+    nx.set_node_attributes(
+        graph,
+        node_property_mapping,
+        name=property_name,
+    )
+
+
+def add_importance_metric(
+    graph: DiGraph | Graph,
+    property_name: str = PROPERTY_NAME_IMPORTANCE,
+    property_name_weighted_degree: str = PROPERTY_NAME_DEGREE_WEIGHTED,
+    property_name_betweenness: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
+) -> None:
+    """Adds a custom importance metric as property to each node of the given graph.
+    Can be used to decide which nodes are of high importance and also to build node size
+    mappings.
+    Operation is performed inplace.
+
+    Parameters
+    ----------
+    graph : DiGraph | Graph
+        Graph with importance metric as node property added inplace
+    property_name : str, optional
+        target name for the property containing the importance metric in nodes,
+        by default PROPERTY_NAME_IMPORTANCE
+    property_name_betweenness : str, optional
+        target name for the property containing the betweenness centrality in nodes,
+        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY
+    """
+    # build mapping for importance metric
+    node_property_mapping: dict[str, float] = {}
+    for node in cast(Iterable[str], graph.nodes):
+        node_data = cast(dict[str, float], graph.nodes[node])
+
+        if property_name_weighted_degree not in node_data:
+            raise NodePropertyNotContainedError(
+                (
+                    f'Node data does not contain weighted degree '
+                    f'with name {property_name_weighted_degree}.'
+                )
+            )
+        elif property_name_betweenness not in node_data:
+            raise NodePropertyNotContainedError(
+                (
+                    f'Node data does not contain betweenness centrality '
+                    f'with name {property_name_betweenness}.'
+                )
+            )
+
+        prio = node_data[property_name_weighted_degree] * node_data[property_name_betweenness]
+        node_property_mapping[node] = prio
+
+    nx.set_node_attributes(
+        graph,
+        node_property_mapping,
        name=property_name,
    )

@@ -351,6 +437,8 @@ def pipe_add_graph_metrics(
    for graph in graphs:
        graph_copy = copy.deepcopy(graph)
        add_weighted_degree(graph_copy)
+        add_betweenness_centrality(graph_copy)
+        add_importance_metric(graph_copy)
        collection.append(graph_copy)

    return tuple(collection)
@@ -762,19 +850,3 @@ class TokenGraph(DiGraph):
                raise ValueError('File format not supported.')

        return graph
-
-    # TODO check removal
-    # @classmethod
-    # def from_pickle(
-    #     cls,
-    #     path: str | Path,
-    # ) -> Self:
-    #     if isinstance(path, str):
-    #         path = Path(path)
-
-    #     if path.suffix not in ('.pkl', '.pickle'):
-    #         raise ValueError('File format not supported.')
-
-    #     graph = typing.cast(Self, load_pickle(path))
-
-    #     return graph
--- a/src/lang_main/config.py
+++ b/src/lang_main/config.py
@@ -19,7 +19,7 @@ except ImportError:

 # ** external packages config
 # ** Huggingface Hub caching
-os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'
+os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

 # ** py4cytoscape config
 if _has_py4cyto:
@@ -36,7 +36,7 @@ BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang-
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
 PKG_DIR: Final[Path] = Path(__file__).parent
-STOP_FOLDER: Final[str] = 'python'
+STOP_FOLDER: Final[str] = os.environ.get('LANG_MAIN_STOP_SEARCH_FOLDERNAME', 'src')


 def load_toml_config(
@@ -65,6 +65,7 @@ def load_cfg(
    starting_path: Path,
    glob_pattern: str,
    stop_folder_name: str | None,
+    lookup_cwd: bool = False,
 ) -> dict[str, Any]:
    """Look for configuration file. Internal configs are not used any more because
    the library behaviour is only guaranteed by external configurations.
@@ -91,9 +92,10 @@ def load_cfg(
    LangMainConfigNotFoundError
        if no config file was found
    """
-    cfg_path: Path | None
-    print('Looking for cfg file in CWD.', flush=True)
-    cfg_path = search_cwd(glob_pattern)
+    cfg_path: Path | None = None
+    if lookup_cwd:
+        print('Looking for cfg file in CWD.', flush=True)
+        cfg_path = search_cwd(glob_pattern)

    if cfg_path is None:
        print(
--- a/src/lang_main/constants.py
+++ b/src/lang_main/constants.py
@@ -54,17 +54,13 @@ PICKLE_PROTOCOL_VERSION: Final[int] = 5
 # config placed in library path of application (usually "bin")
 input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
 INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve()
-# TODO reactivate later
-if not INPUT_PATH_FOLDER.exists():
+if not INPUT_PATH_FOLDER.exists():  # pragma: no cover
    raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
 save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
 SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve()
-if not SAVE_PATH_FOLDER.exists():
+if not SAVE_PATH_FOLDER.exists():  # pragma: no cover
    raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.')
-path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset'])
-PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve()
-# if not PATH_TO_DATASET.exists():
-#     raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
+
 # ** control
 SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
 SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
@@ -82,22 +78,34 @@ MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
 if not MODEL_BASE_FOLDER.exists():
    raise FileNotFoundError('Language model folder not found.')
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
-SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM
-STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
+
+# LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
+# LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; mainly for internal use
+# LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; mainly for internal use
+# LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"
+
+SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = os.environ.get(
+    'LANG_MAIN_SPACY_MODEL', SpacyModelTypes.DE_CORE_NEWS_SM
+)
+STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get(
+    'LANG_MAIN_STFR_MODEL', STFRModelTypes.ALL_MPNET_BASE_V2
+)
 STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
 STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
-STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
-STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
-STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
+STFR_BACKEND: Final[str | STFRBackends] = os.environ.get(
+    'LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH
+)
+stfr_model_args_default: STFRModelArgs = {}
+stfr_model_args_onnx: STFRModelArgs = {
    'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
    'provider': ONNXExecutionProvider.CPU,
    'export': False,
 }
 stfr_model_args: STFRModelArgs
 if STFR_BACKEND == STFRBackends.ONNX:
-    stfr_model_args = STFR_MODEL_ARGS_ONNX
+    stfr_model_args = stfr_model_args_onnx
 else:
-    stfr_model_args = STFR_MODEL_ARGS_DEFAULT
+    stfr_model_args = stfr_model_args_default

 STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
 # ** language dependency analysis
@@ -122,6 +130,8 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
 EDGE_WEIGHT_DECIMALS: Final[int] = 4
 THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
 PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
+PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
+PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'

 # ** graph exports (Cytoscape)
 CYTO_MAX_NODE_COUNT: Final[int] = 500
--- a/src/lang_main/env_vars.txt
+++ b/src/lang_main/env_vars.txt
@@ -0,0 +1,6 @@
+# list of all library's environment variables
+LANG_MAIN_STOP_SEARCH_FOLDERNAME : foldername in package directory tree at which the lookup should stop; used to find directory root
+LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
+LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; mainly for internal use
+LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; mainly for internal use
+LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"
--- a/src/lang_main/errors.py
+++ b/src/lang_main/errors.py
@@ -8,6 +8,10 @@ class LanguageModelNotFoundError(Exception):


 # ** token graph exceptions
+class NodePropertyNotContainedError(Exception):
+    """Error raised if a needed node property is not contained in graph nodes"""
+
+
 class EdgePropertyNotContainedError(Exception):
    """Error raised if a needed edge property is not contained in graph edges"""

--- a/src/lang_main/lang_main_config.toml
+++ b/src/lang_main/lang_main_config.toml
@@ -1,22 +1,18 @@
 # lang_main: Config file
-[info]
-pkg = 'lang_main_internal'

 [paths]
-inputs = './data/in/'
+inputs = '../data/in/'
 # results = './results/dummy_N_1000/'
 # dataset = '../data/Dummy_Dataset_N_1000.csv'
-results = './data/out/'
-dataset = '../data/02_202307/Export4.csv'
-models = '../../lang-models'
+results = '../data/out/'
+models = './lang-models'

 [logging]
 enabled = true
 stderr = true
 file = true

-# only debugging features, production-ready pipelines should always
-# be fully executed
+# control which pipelines are executed
 [control]
 preprocessing_skip = false
 token_analysis_skip = false
--- a/src/lang_main/loggers.py
+++ b/src/lang_main/loggers.py
@@ -33,7 +33,7 @@ if ENABLE_LOGGING and LOGGING_TO_STDERR:
    logger_all_handler_stderr = logging.StreamHandler()
    logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR)
    logger_all_handler_stderr.setFormatter(logger_all_formater)
-else:
+else:  # pragma: no cover
    logger_all_handler_stderr = null_handler

 if ENABLE_LOGGING and LOGGING_TO_FILE:
@@ -45,7 +45,7 @@ if ENABLE_LOGGING and LOGGING_TO_FILE:
    )
    logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE)
    logger_all_handler_file.setFormatter(logger_all_formater)
-else:
+else:  # pragma: no cover
    logger_all_handler_file = null_handler


--- a/src/lang_main/pipelines/base.py
+++ b/src/lang_main/pipelines/base.py
@@ -33,6 +33,7 @@ class BasePipeline(ABC):
        # container for actions to perform during pass
        self.actions: list[Callable] = []
        self.action_names: list[str] = []
+        self.action_skip: list[bool] = []
        # progress tracking, start at 1
        self.curr_proc_idx: int = 1

@@ -104,8 +105,6 @@ class PipelineContainer(BasePipeline):
    ) -> None:
        super().__init__(name=name, working_dir=working_dir)

-        self.action_skip: list[bool] = []
-
    @override
    def add(
        self,
@@ -170,6 +169,7 @@ class Pipeline(BasePipeline):
        self,
        action: Callable,
        action_kwargs: dict[str, Any] | None = None,
+        skip: bool = False,
        save_result: bool = False,
        load_result: bool = False,
        filename: str | None = None,
@@ -183,6 +183,7 @@ class Pipeline(BasePipeline):
            self.actions.append(action)
            self.action_names.append(action.__name__)
            self.actions_kwargs.append(action_kwargs.copy())
+            self.action_skip.append(skip)
            self.save_results.append((save_result, filename))
            self.load_results.append((load_result, filename))
        else:
@@ -235,7 +236,13 @@ class Pipeline(BasePipeline):
        self,
        starting_values: tuple[Any, ...] | None = None,
    ) -> tuple[Any, ...]:
+        first_performed: bool = False
+
        for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
+            if self.action_skip[idx]:
+                self.curr_proc_idx += 1
+                continue
+
            # loading
            if self.load_results[idx][0]:
                filename = self.load_results[idx][1]
@@ -248,8 +255,9 @@ class Pipeline(BasePipeline):
                self.curr_proc_idx += 1
                continue
            # calculation
-            if idx == 0:
+            if not first_performed:
                args = starting_values
+                first_performed = True
            else:
                args = ret

--- a/src/lang_main/render/cytoscape.py
+++ b/src/lang_main/render/cytoscape.py
@@ -296,7 +296,7 @@ def apply_style_to_network(
    style_name: str = CYTO_STYLESHEET_NAME,
    pth_to_stylesheet: Path = CYTO_PATH_STYLESHEET,
    network_name: str = CYTO_BASE_NETWORK_NAME,
-    node_size_property: str = 'node_selection',
+    node_size_property: str = CYTO_SELECTION_PROPERTY,
    min_node_size: int = 15,
    max_node_size: int = 40,
    sandbox_name: str = CYTO_SANDBOX_NAME,