added new graph metrics

This commit is contained in:
Florian Förster
2024-12-19 16:26:01 +01:00
parent 123869e203
commit 80a35c4658
24 changed files with 826 additions and 97 deletions

View File

@@ -16,12 +16,15 @@ from pandas import DataFrame
from lang_main.constants import (
EDGE_WEIGHT_DECIMALS,
LOGGING_DEFAULT_GRAPHS,
PROPERTY_NAME_BETWEENNESS_CENTRALITY,
PROPERTY_NAME_DEGREE_WEIGHTED,
PROPERTY_NAME_IMPORTANCE,
)
from lang_main.errors import (
EdgePropertyNotContainedError,
EmptyEdgesError,
EmptyGraphError,
NodePropertyNotContainedError,
)
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
@@ -310,15 +313,98 @@ def add_weighted_degree(
property of the edges which contains the weight information, by default 'weight'
property_name : str, optional
target name for the property containing the weighted degree in nodes,
by default 'degree_weighted'
by default PROPERTY_NAME_DEGREE_WEIGHTED
"""
node_degree_mapping = cast(
node_property_mapping = cast(
dict[str, float],
dict(graph.degree(weight=edge_weight_property)), # type: ignore
)
nx.set_node_attributes(
graph,
node_degree_mapping,
node_property_mapping,
name=property_name,
)
def add_betweenness_centrality(
    graph: DiGraph | Graph,
    edge_weight_property: str | None = None,
    property_name: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
) -> None:
    """Store the normalised betweenness centrality of every node as a node property.

    The graph is modified inplace, nothing is returned.

    Parameters
    ----------
    graph : DiGraph | Graph
        graph whose nodes receive the betweenness centrality as property
    edge_weight_property : str | None, optional
        name of the edge property which is passed as ``weight`` to
        ``nx.betweenness_centrality``, by default None
    property_name : str, optional
        name of the node property under which the betweenness centrality is stored,
        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY
    """
    centrality_by_node = cast(
        dict[str, float],
        nx.betweenness_centrality(  # type: ignore
            graph,
            weight=edge_weight_property,
            normalized=True,
        ),
    )
    # write the computed values back onto the nodes of the very same graph object
    nx.set_node_attributes(graph, centrality_by_node, name=property_name)
def add_importance_metric(
    graph: DiGraph | Graph,
    property_name: str = PROPERTY_NAME_IMPORTANCE,
    property_name_weighted_degree: str = PROPERTY_NAME_DEGREE_WEIGHTED,
    property_name_betweenness: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
) -> None:
    """Adds a custom importance metric as property to each node of the given graph.

    The importance of a node is the product of its weighted degree and its
    betweenness centrality. Both values must already be present as node
    properties. Can be used to decide which nodes are of high importance and
    also to build node size mappings.
    Operation is performed inplace.

    Parameters
    ----------
    graph : DiGraph | Graph
        Graph with importance metric as node property added inplace
    property_name : str, optional
        target name for the property containing the importance metric in nodes,
        by default PROPERTY_NAME_IMPORTANCE
    property_name_weighted_degree : str, optional
        name of the node property from which the weighted degree is read,
        by default PROPERTY_NAME_DEGREE_WEIGHTED
    property_name_betweenness : str, optional
        name of the node property from which the betweenness centrality is read,
        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY

    Raises
    ------
    NodePropertyNotContainedError
        if any node lacks the weighted degree or the betweenness centrality
        property; the graph is left unchanged in this case
    """
    # required input properties paired with the label used in the error message;
    # the weighted degree is checked first, then the betweenness centrality
    required_properties = (
        (property_name_weighted_degree, 'weighted degree'),
        (property_name_betweenness, 'betweenness centrality'),
    )
    # build mapping for importance metric
    node_property_mapping: dict[str, float] = {}
    for node, node_data in cast(
        Iterable[tuple[str, dict[str, float]]],
        graph.nodes(data=True),
    ):
        for required_name, label in required_properties:
            if required_name not in node_data:
                raise NodePropertyNotContainedError(
                    f'Node data does not contain {label} with name {required_name}.'
                )
        node_property_mapping[node] = (
            node_data[property_name_weighted_degree] * node_data[property_name_betweenness]
        )

    # attributes are only written after every node passed the checks above
    nx.set_node_attributes(
        graph,
        node_property_mapping,
        name=property_name,
    )
@@ -351,6 +437,8 @@ def pipe_add_graph_metrics(
for graph in graphs:
graph_copy = copy.deepcopy(graph)
add_weighted_degree(graph_copy)
add_betweenness_centrality(graph_copy)
add_importance_metric(graph_copy)
collection.append(graph_copy)
return tuple(collection)
@@ -762,19 +850,3 @@ class TokenGraph(DiGraph):
raise ValueError('File format not supported.')
return graph
# TODO check removal
# @classmethod
# def from_pickle(
# cls,
# path: str | Path,
# ) -> Self:
# if isinstance(path, str):
# path = Path(path)
# if path.suffix not in ('.pkl', '.pickle'):
# raise ValueError('File format not supported.')
# graph = typing.cast(Self, load_pickle(path))
# return graph

View File

@@ -19,7 +19,7 @@ except ImportError:
# ** external packages config
# ** Huggingface Hub caching
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
# ** py4cytoscape config
if _has_py4cyto:
@@ -36,7 +36,7 @@ BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang-
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
PKG_DIR: Final[Path] = Path(__file__).parent
STOP_FOLDER: Final[str] = 'python'
STOP_FOLDER: Final[str] = os.environ.get('LANG_MAIN_STOP_SEARCH_FOLDERNAME', 'src')
def load_toml_config(
@@ -65,6 +65,7 @@ def load_cfg(
starting_path: Path,
glob_pattern: str,
stop_folder_name: str | None,
lookup_cwd: bool = False,
) -> dict[str, Any]:
"""Look for configuration file. Internal configs are not used any more because
the library behaviour is only guaranteed by external configurations.
@@ -91,9 +92,10 @@ def load_cfg(
LangMainConfigNotFoundError
if no config file was found
"""
cfg_path: Path | None
print('Looking for cfg file in CWD.', flush=True)
cfg_path = search_cwd(glob_pattern)
cfg_path: Path | None = None
if lookup_cwd:
print('Looking for cfg file in CWD.', flush=True)
cfg_path = search_cwd(glob_pattern)
if cfg_path is None:
print(

View File

@@ -54,17 +54,13 @@ PICKLE_PROTOCOL_VERSION: Final[int] = 5
# config placed in library path of application (usually "bin")
input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve()
# TODO reactivate later
if not INPUT_PATH_FOLDER.exists():
if not INPUT_PATH_FOLDER.exists(): # pragma: no cover
raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve()
if not SAVE_PATH_FOLDER.exists():
if not SAVE_PATH_FOLDER.exists(): # pragma: no cover
raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.')
path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset'])
PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve()
# if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
@@ -82,22 +78,34 @@ MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
if not MODEL_BASE_FOLDER.exists():
raise FileNotFoundError('Language model folder not found.')
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM
STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
# LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
# LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use
# LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use
# LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"
SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = os.environ.get(
'LANG_MAIN_SPACY_MODEL', SpacyModelTypes.DE_CORE_NEWS_SM
)
STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get(
'LANG_MAIN_STFR_MODEL', STFRModelTypes.ALL_MPNET_BASE_V2
)
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
STFR_BACKEND: Final[str | STFRBackends] = os.environ.get(
'LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH
)
stfr_model_args_default: STFRModelArgs = {}
stfr_model_args_onnx: STFRModelArgs = {
'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
'provider': ONNXExecutionProvider.CPU,
'export': False,
}
stfr_model_args: STFRModelArgs
if STFR_BACKEND == STFRBackends.ONNX:
stfr_model_args = STFR_MODEL_ARGS_ONNX
stfr_model_args = stfr_model_args_onnx
else:
stfr_model_args = STFR_MODEL_ARGS_DEFAULT
stfr_model_args = stfr_model_args_default
STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
# ** language dependency analysis
@@ -122,6 +130,8 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
EDGE_WEIGHT_DECIMALS: Final[int] = 4
THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'
# ** graph exports (Cytoscape)
CYTO_MAX_NODE_COUNT: Final[int] = 500

View File

@@ -0,0 +1,6 @@
# list of all library's environment variables
LANG_MAIN_STOP_SEARCH_FOLDERNAME : foldername in package directory tree at which the lookup should stop; used to find directory root
LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use
LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use
LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"

View File

@@ -8,6 +8,10 @@ class LanguageModelNotFoundError(Exception):
# ** token graph exceptions
class NodePropertyNotContainedError(Exception):
    """Error raised if a needed node property is not contained in graph nodes"""
class EdgePropertyNotContainedError(Exception):
    """Error raised if a needed edge property is not contained in the graph's edges"""

View File

@@ -1,22 +1,18 @@
# lang_main: Config file
[info]
pkg = 'lang_main_internal'
[paths]
inputs = './data/in/'
inputs = '../data/in/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './data/out/'
dataset = '../data/02_202307/Export4.csv'
models = '../../lang-models'
results = '../data/out/'
models = './lang-models'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false

View File

@@ -33,7 +33,7 @@ if ENABLE_LOGGING and LOGGING_TO_STDERR:
logger_all_handler_stderr = logging.StreamHandler()
logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR)
logger_all_handler_stderr.setFormatter(logger_all_formater)
else:
else: # pragma: no cover
logger_all_handler_stderr = null_handler
if ENABLE_LOGGING and LOGGING_TO_FILE:
@@ -45,7 +45,7 @@ if ENABLE_LOGGING and LOGGING_TO_FILE:
)
logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE)
logger_all_handler_file.setFormatter(logger_all_formater)
else:
else: # pragma: no cover
logger_all_handler_file = null_handler

View File

@@ -33,6 +33,7 @@ class BasePipeline(ABC):
# container for actions to perform during pass
self.actions: list[Callable] = []
self.action_names: list[str] = []
self.action_skip: list[bool] = []
# progress tracking, start at 1
self.curr_proc_idx: int = 1
@@ -104,8 +105,6 @@ class PipelineContainer(BasePipeline):
) -> None:
super().__init__(name=name, working_dir=working_dir)
self.action_skip: list[bool] = []
@override
def add(
self,
@@ -170,6 +169,7 @@ class Pipeline(BasePipeline):
self,
action: Callable,
action_kwargs: dict[str, Any] | None = None,
skip: bool = False,
save_result: bool = False,
load_result: bool = False,
filename: str | None = None,
@@ -183,6 +183,7 @@ class Pipeline(BasePipeline):
self.actions.append(action)
self.action_names.append(action.__name__)
self.actions_kwargs.append(action_kwargs.copy())
self.action_skip.append(skip)
self.save_results.append((save_result, filename))
self.load_results.append((load_result, filename))
else:
@@ -235,7 +236,13 @@ class Pipeline(BasePipeline):
self,
starting_values: tuple[Any, ...] | None = None,
) -> tuple[Any, ...]:
first_performed: bool = False
for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
if self.action_skip[idx]:
self.curr_proc_idx += 1
continue
# loading
if self.load_results[idx][0]:
filename = self.load_results[idx][1]
@@ -248,8 +255,9 @@ class Pipeline(BasePipeline):
self.curr_proc_idx += 1
continue
# calculation
if idx == 0:
if not first_performed:
args = starting_values
first_performed = True
else:
args = ret

View File

@@ -296,7 +296,7 @@ def apply_style_to_network(
style_name: str = CYTO_STYLESHEET_NAME,
pth_to_stylesheet: Path = CYTO_PATH_STYLESHEET,
network_name: str = CYTO_BASE_NETWORK_NAME,
node_size_property: str = 'node_selection',
node_size_property: str = CYTO_SELECTION_PROPERTY,
min_node_size: int = 15,
max_node_size: int = 40,
sandbox_name: str = CYTO_SANDBOX_NAME,