added new graph metrics
This commit is contained in:
@@ -16,12 +16,15 @@ from pandas import DataFrame
|
||||
from lang_main.constants import (
|
||||
EDGE_WEIGHT_DECIMALS,
|
||||
LOGGING_DEFAULT_GRAPHS,
|
||||
PROPERTY_NAME_BETWEENNESS_CENTRALITY,
|
||||
PROPERTY_NAME_DEGREE_WEIGHTED,
|
||||
PROPERTY_NAME_IMPORTANCE,
|
||||
)
|
||||
from lang_main.errors import (
|
||||
EdgePropertyNotContainedError,
|
||||
EmptyEdgesError,
|
||||
EmptyGraphError,
|
||||
NodePropertyNotContainedError,
|
||||
)
|
||||
from lang_main.io import load_pickle, save_pickle
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
@@ -310,15 +313,98 @@ def add_weighted_degree(
|
||||
property of the edges which contains the weight information, by default 'weight'
|
||||
property_name : str, optional
|
||||
target name for the property containing the weighted degree in nodes,
|
||||
by default 'degree_weighted'
|
||||
by default PROPERTY_NAME_DEGREE_WEIGHTED
|
||||
"""
|
||||
node_degree_mapping = cast(
|
||||
node_property_mapping = cast(
|
||||
dict[str, float],
|
||||
dict(graph.degree(weight=edge_weight_property)), # type: ignore
|
||||
)
|
||||
nx.set_node_attributes(
|
||||
graph,
|
||||
node_degree_mapping,
|
||||
node_property_mapping,
|
||||
name=property_name,
|
||||
)
|
||||
|
||||
|
||||
def add_betweenness_centrality(
    graph: DiGraph | Graph,
    edge_weight_property: str | None = None,
    property_name: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
) -> None:
    """Annotate every node of the given graph with its betweenness centrality.

    The centrality is computed by networkx with ``normalized=True`` and written
    back as a node property. The graph is modified inplace, nothing is returned.

    Parameters
    ----------
    graph : DiGraph | Graph
        graph whose nodes receive the betweenness centrality as property
    edge_weight_property : str | None, optional
        name of the edge property handed to networkx as ``weight`` argument,
        by default None
    property_name : str, optional
        name under which the betweenness centrality is stored in the nodes,
        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY
    """
    centrality_by_node = cast(
        dict[str, float],
        nx.betweenness_centrality(  # type: ignore
            graph,
            weight=edge_weight_property,
            normalized=True,
        ),
    )
    nx.set_node_attributes(graph, centrality_by_node, name=property_name)
|
||||
|
||||
|
||||
def add_importance_metric(
    graph: DiGraph | Graph,
    property_name: str = PROPERTY_NAME_IMPORTANCE,
    property_name_weighted_degree: str = PROPERTY_NAME_DEGREE_WEIGHTED,
    property_name_betweenness: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY,
) -> None:
    """Adds a custom importance metric as property to each node of the given graph.

    The importance of a node is the product of its weighted degree and its
    betweenness centrality. Both values are read from the node data, so they
    must already be present on every node before this function is called.
    Can be used to decide which nodes are of high importance and also to build
    node size mappings.
    Operation is performed inplace.

    Parameters
    ----------
    graph : DiGraph | Graph
        Graph with importance metric as node property added inplace
    property_name : str, optional
        target name for the property containing the importance metric in nodes,
        by default PROPERTY_NAME_IMPORTANCE
    property_name_weighted_degree : str, optional
        name of the existing node property containing the weighted degree,
        by default PROPERTY_NAME_DEGREE_WEIGHTED
    property_name_betweenness : str, optional
        name of the existing node property containing the betweenness centrality,
        by default PROPERTY_NAME_BETWEENNESS_CENTRALITY

    Raises
    ------
    NodePropertyNotContainedError
        if a node does not contain the weighted degree or the betweenness
        centrality under the given property names
    """
    # build the complete mapping first: attributes are only written after all
    # nodes were validated, so a missing property leaves the graph untouched
    node_property_mapping: dict[str, float] = {}
    for node in cast(Iterable[str], graph.nodes):
        node_data = cast(dict[str, float], graph.nodes[node])

        if property_name_weighted_degree not in node_data:
            raise NodePropertyNotContainedError(
                f'Node data does not contain weighted degree '
                f'with name {property_name_weighted_degree}.'
            )
        if property_name_betweenness not in node_data:
            raise NodePropertyNotContainedError(
                f'Node data does not contain betweenness centrality '
                f'with name {property_name_betweenness}.'
            )

        # importance = weighted degree * betweenness centrality
        node_property_mapping[node] = (
            node_data[property_name_weighted_degree] * node_data[property_name_betweenness]
        )

    nx.set_node_attributes(
        graph,
        node_property_mapping,
        name=property_name,
    )
|
||||
|
||||
@@ -351,6 +437,8 @@ def pipe_add_graph_metrics(
|
||||
for graph in graphs:
|
||||
graph_copy = copy.deepcopy(graph)
|
||||
add_weighted_degree(graph_copy)
|
||||
add_betweenness_centrality(graph_copy)
|
||||
add_importance_metric(graph_copy)
|
||||
collection.append(graph_copy)
|
||||
|
||||
return tuple(collection)
|
||||
@@ -762,19 +850,3 @@ class TokenGraph(DiGraph):
|
||||
raise ValueError('File format not supported.')
|
||||
|
||||
return graph
|
||||
|
||||
# TODO check removal
|
||||
# @classmethod
|
||||
# def from_pickle(
|
||||
# cls,
|
||||
# path: str | Path,
|
||||
# ) -> Self:
|
||||
# if isinstance(path, str):
|
||||
# path = Path(path)
|
||||
|
||||
# if path.suffix not in ('.pkl', '.pickle'):
|
||||
# raise ValueError('File format not supported.')
|
||||
|
||||
# graph = typing.cast(Self, load_pickle(path))
|
||||
|
||||
# return graph
|
||||
|
||||
@@ -19,7 +19,7 @@ except ImportError:
|
||||
|
||||
# ** external packages config
|
||||
# ** Huggingface Hub caching
|
||||
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'
|
||||
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
|
||||
|
||||
# ** py4cytoscape config
|
||||
if _has_py4cyto:
|
||||
@@ -36,7 +36,7 @@ BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang-
|
||||
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
|
||||
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
|
||||
PKG_DIR: Final[Path] = Path(__file__).parent
|
||||
STOP_FOLDER: Final[str] = 'python'
|
||||
STOP_FOLDER: Final[str] = os.environ.get('LANG_MAIN_STOP_SEARCH_FOLDERNAME', 'src')
|
||||
|
||||
|
||||
def load_toml_config(
|
||||
@@ -65,6 +65,7 @@ def load_cfg(
|
||||
starting_path: Path,
|
||||
glob_pattern: str,
|
||||
stop_folder_name: str | None,
|
||||
lookup_cwd: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
"""Look for configuration file. Internal configs are not used any more because
|
||||
the library behaviour is only guaranteed by external configurations.
|
||||
@@ -91,9 +92,10 @@ def load_cfg(
|
||||
LangMainConfigNotFoundError
|
||||
if no config file was found
|
||||
"""
|
||||
cfg_path: Path | None
|
||||
print('Looking for cfg file in CWD.', flush=True)
|
||||
cfg_path = search_cwd(glob_pattern)
|
||||
cfg_path: Path | None = None
|
||||
if lookup_cwd:
|
||||
print('Looking for cfg file in CWD.', flush=True)
|
||||
cfg_path = search_cwd(glob_pattern)
|
||||
|
||||
if cfg_path is None:
|
||||
print(
|
||||
|
||||
@@ -54,17 +54,13 @@ PICKLE_PROTOCOL_VERSION: Final[int] = 5
|
||||
# config placed in library path of application (usually "bin")
|
||||
input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
|
||||
INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve()
|
||||
# TODO reactivate later
|
||||
if not INPUT_PATH_FOLDER.exists():
|
||||
if not INPUT_PATH_FOLDER.exists(): # pragma: no cover
|
||||
raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
|
||||
save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
|
||||
SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve()
|
||||
if not SAVE_PATH_FOLDER.exists():
|
||||
if not SAVE_PATH_FOLDER.exists(): # pragma: no cover
|
||||
raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.')
|
||||
path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset'])
|
||||
PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve()
|
||||
# if not PATH_TO_DATASET.exists():
|
||||
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
|
||||
|
||||
# ** control
|
||||
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
|
||||
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
|
||||
@@ -82,22 +78,34 @@ MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
|
||||
if not MODEL_BASE_FOLDER.exists():
|
||||
raise FileNotFoundError('Language model folder not found.')
|
||||
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
|
||||
SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM
|
||||
STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
|
||||
|
||||
# LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
|
||||
# LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use
|
||||
# LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use
|
||||
# LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"
|
||||
|
||||
SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = os.environ.get(
|
||||
'LANG_MAIN_SPACY_MODEL', SpacyModelTypes.DE_CORE_NEWS_SM
|
||||
)
|
||||
STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get(
|
||||
'LANG_MAIN_STFR_MODEL', STFRModelTypes.ALL_MPNET_BASE_V2
|
||||
)
|
||||
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
|
||||
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
|
||||
STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
|
||||
STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
|
||||
STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
|
||||
STFR_BACKEND: Final[str | STFRBackends] = os.environ.get(
|
||||
'LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH
|
||||
)
|
||||
stfr_model_args_default: STFRModelArgs = {}
|
||||
stfr_model_args_onnx: STFRModelArgs = {
|
||||
'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
|
||||
'provider': ONNXExecutionProvider.CPU,
|
||||
'export': False,
|
||||
}
|
||||
stfr_model_args: STFRModelArgs
|
||||
if STFR_BACKEND == STFRBackends.ONNX:
|
||||
stfr_model_args = STFR_MODEL_ARGS_ONNX
|
||||
stfr_model_args = stfr_model_args_onnx
|
||||
else:
|
||||
stfr_model_args = STFR_MODEL_ARGS_DEFAULT
|
||||
stfr_model_args = stfr_model_args_default
|
||||
|
||||
STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
|
||||
# ** language dependency analysis
|
||||
@@ -122,6 +130,8 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
|
||||
EDGE_WEIGHT_DECIMALS: Final[int] = 4
|
||||
THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
|
||||
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
|
||||
PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
|
||||
PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'
|
||||
|
||||
# ** graph exports (Cytoscape)
|
||||
CYTO_MAX_NODE_COUNT: Final[int] = 500
|
||||
|
||||
6
src/lang_main/env_vars.txt
Normal file
6
src/lang_main/env_vars.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
# list of all library's environment variables
|
||||
LANG_MAIN_STOP_SEARCH_FOLDERNAME : foldername in package directory tree at which the lookup should stop; used to find directory root
|
||||
LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found)
|
||||
LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use
|
||||
LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use
|
||||
LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx"
|
||||
@@ -8,6 +8,10 @@ class LanguageModelNotFoundError(Exception):
|
||||
|
||||
|
||||
# ** token graph exceptions
|
||||
class NodePropertyNotContainedError(Exception):
    """Error raised if a needed node property is not contained in graph nodes"""
|
||||
|
||||
|
||||
class EdgePropertyNotContainedError(Exception):
    """Raised when the edges of a graph lack a required edge property"""
|
||||
|
||||
|
||||
@@ -1,22 +1,18 @@
|
||||
# lang_main: Config file
|
||||
[info]
|
||||
pkg = 'lang_main_internal'
|
||||
|
||||
[paths]
|
||||
inputs = './data/in/'
|
||||
inputs = '../data/in/'
|
||||
# results = './results/dummy_N_1000/'
|
||||
# dataset = '../data/Dummy_Dataset_N_1000.csv'
|
||||
results = './data/out/'
|
||||
dataset = '../data/02_202307/Export4.csv'
|
||||
models = '../../lang-models'
|
||||
results = '../data/out/'
|
||||
models = './lang-models'
|
||||
|
||||
[logging]
|
||||
enabled = true
|
||||
stderr = true
|
||||
file = true
|
||||
|
||||
# only debugging features, production-ready pipelines should always
|
||||
# be fully executed
|
||||
# control which pipelines are executed
|
||||
[control]
|
||||
preprocessing_skip = false
|
||||
token_analysis_skip = false
|
||||
|
||||
@@ -33,7 +33,7 @@ if ENABLE_LOGGING and LOGGING_TO_STDERR:
|
||||
logger_all_handler_stderr = logging.StreamHandler()
|
||||
logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR)
|
||||
logger_all_handler_stderr.setFormatter(logger_all_formater)
|
||||
else:
|
||||
else: # pragma: no cover
|
||||
logger_all_handler_stderr = null_handler
|
||||
|
||||
if ENABLE_LOGGING and LOGGING_TO_FILE:
|
||||
@@ -45,7 +45,7 @@ if ENABLE_LOGGING and LOGGING_TO_FILE:
|
||||
)
|
||||
logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE)
|
||||
logger_all_handler_file.setFormatter(logger_all_formater)
|
||||
else:
|
||||
else: # pragma: no cover
|
||||
logger_all_handler_file = null_handler
|
||||
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ class BasePipeline(ABC):
|
||||
# container for actions to perform during pass
|
||||
self.actions: list[Callable] = []
|
||||
self.action_names: list[str] = []
|
||||
self.action_skip: list[bool] = []
|
||||
# progress tracking, start at 1
|
||||
self.curr_proc_idx: int = 1
|
||||
|
||||
@@ -104,8 +105,6 @@ class PipelineContainer(BasePipeline):
|
||||
) -> None:
|
||||
super().__init__(name=name, working_dir=working_dir)
|
||||
|
||||
self.action_skip: list[bool] = []
|
||||
|
||||
@override
|
||||
def add(
|
||||
self,
|
||||
@@ -170,6 +169,7 @@ class Pipeline(BasePipeline):
|
||||
self,
|
||||
action: Callable,
|
||||
action_kwargs: dict[str, Any] | None = None,
|
||||
skip: bool = False,
|
||||
save_result: bool = False,
|
||||
load_result: bool = False,
|
||||
filename: str | None = None,
|
||||
@@ -183,6 +183,7 @@ class Pipeline(BasePipeline):
|
||||
self.actions.append(action)
|
||||
self.action_names.append(action.__name__)
|
||||
self.actions_kwargs.append(action_kwargs.copy())
|
||||
self.action_skip.append(skip)
|
||||
self.save_results.append((save_result, filename))
|
||||
self.load_results.append((load_result, filename))
|
||||
else:
|
||||
@@ -235,7 +236,13 @@ class Pipeline(BasePipeline):
|
||||
self,
|
||||
starting_values: tuple[Any, ...] | None = None,
|
||||
) -> tuple[Any, ...]:
|
||||
first_performed: bool = False
|
||||
|
||||
for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
|
||||
if self.action_skip[idx]:
|
||||
self.curr_proc_idx += 1
|
||||
continue
|
||||
|
||||
# loading
|
||||
if self.load_results[idx][0]:
|
||||
filename = self.load_results[idx][1]
|
||||
@@ -248,8 +255,9 @@ class Pipeline(BasePipeline):
|
||||
self.curr_proc_idx += 1
|
||||
continue
|
||||
# calculation
|
||||
if idx == 0:
|
||||
if not first_performed:
|
||||
args = starting_values
|
||||
first_performed = True
|
||||
else:
|
||||
args = ret
|
||||
|
||||
|
||||
@@ -296,7 +296,7 @@ def apply_style_to_network(
|
||||
style_name: str = CYTO_STYLESHEET_NAME,
|
||||
pth_to_stylesheet: Path = CYTO_PATH_STYLESHEET,
|
||||
network_name: str = CYTO_BASE_NETWORK_NAME,
|
||||
node_size_property: str = 'node_selection',
|
||||
node_size_property: str = CYTO_SELECTION_PROPERTY,
|
||||
min_node_size: int = 15,
|
||||
max_node_size: int = 40,
|
||||
sandbox_name: str = CYTO_SANDBOX_NAME,
|
||||
|
||||
Reference in New Issue
Block a user