adjustment of skewed edge weights

This commit is contained in:
Florian Förster
2024-07-03 18:01:33 +02:00
parent 2656780907
commit 55a9ef7f69
5 changed files with 853 additions and 134 deletions

View File

@@ -13,6 +13,7 @@ import numpy.typing as npt
from networkx import DiGraph, Graph
from pandas import DataFrame
from lang_main.constants import EDGE_WEIGHT_DECIMALS
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_graphs as logger
from lang_main.types import (
@@ -105,15 +106,26 @@ def update_graph(
def convert_graph_to_undirected(
graph: DiGraph,
logging: bool = LOGGING_DEFAULT,
cast_int: bool = False,
) -> Graph:
dtype = np.float32
if cast_int:
dtype = np.uint32
# get adjacency matrix
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=dtype))
arr = typing.cast(npt.NDArray[np.float32 | np.uint32], adj_mat.to_numpy())
if not cast_int:
arr = arr * (10**EDGE_WEIGHT_DECIMALS)
arr = np.round(arr, decimals=0)
arr = arr.astype(np.uint32)
# build undirected array: adding edges of lower triangular matrix to upper one
arr_upper = np.triu(arr)
arr_lower = np.tril(arr)
arr_lower = np.rot90(np.fliplr(arr_lower))
arr_new = arr_upper + arr_lower
if not cast_int:
arr_new = (arr_new / 10**EDGE_WEIGHT_DECIMALS).astype(np.float32)
arr_new = np.round(arr_new, decimals=EDGE_WEIGHT_DECIMALS)
# assign new data and create graph
adj_mat.loc[:] = arr_new # type: ignore
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
@@ -164,6 +176,7 @@ def convert_graph_to_cytoscape(
}
cyto_data.append(edge_data)
# TODO: add internal behaviour (if edge added check for new min/max)
min_weight: int = 0
max_weight: int = 0
if weights:
@@ -257,6 +270,78 @@ def filter_graph_by_node_degree(
return filtered_graph
def normalise_array_linear(
    array: npt.NDArray[np.floating],
) -> npt.NDArray[np.float32]:
    """Apply standard linear (min/max) normalisation.

    Every value is mapped to ``(value - min) / (max - min)``, so the smallest
    entry becomes 0 and the largest becomes 1.

    Parameters
    ----------
    array : npt.NDArray[np.floating]
        array which shall be normalised

    Returns
    -------
    npt.NDArray[np.float32]
        min/max normalised array; an empty input yields an empty array and
        an input whose values are all identical yields an array of zeros
    """
    # ``min``/``max`` raise on zero-size arrays, so there is nothing to scale
    if array.size == 0:
        return array.astype(np.float32)

    lowest = array.min()
    value_range = array.max() - lowest
    # all values identical: dividing by the zero range would only produce NaN,
    # therefore every entry is mapped to the lower bound of the target interval
    if value_range == 0:
        return np.zeros_like(array, dtype=np.float32)

    arr_norm = (array - lowest) / value_range
    return arr_norm.astype(np.float32)
def weight_scaling(
    weights: npt.NDArray[np.float32],
    a: float = 1.1,
    b: float = 0.05,
) -> npt.NDArray[np.float32]:
    """Non-linear scaling of edge weights that were already normalised to [0;1].

    Each weight ``w`` is mapped to ``(b**w - a) / (b - a)`` and the result is
    rounded to ``EDGE_WEIGHT_DECIMALS`` decimals. A weight of 1 is mapped to 1,
    a weight of 0 is mapped to ``(1 - a) / (b - a)``. With the default
    parameters the results therefore lie approximately in [0.1; 1].

    For ``0 < b < 1 < a`` the curve flattens towards larger weights: the slope
    at weight 0 is ``1 / b`` times the slope at weight 1, so differences
    between big weights are compressed more than differences between small
    weights. Smaller values of ``b`` reinforce this effect.

    Based on:
    https://math.stackexchange.com/questions/4297805/exponential-function-that-passes-through-0-0-and-1-1-with-variable-slope

    Parameters
    ----------
    weights : npt.NDArray[np.float32]
        pre-normalised edge weights as 1D array
    a : float, optional
        determines the result for edge weights with value 0
        (approx. 0.1 with the defaults), by default 1.1
    b : float, optional
        base of the exponential which controls the curvature, smaller values
        increase it, by default 0.05

    Returns
    -------
    npt.NDArray[np.float32]
        non-linear adjusted edge weights as 1D array
    """
    numerator = b**weights - a
    denominator = b - a
    scaled = numerator / denominator
    return np.round(scaled, decimals=EDGE_WEIGHT_DECIMALS)
def rescale_edge_weights(
    graph: TokenGraph,
) -> TokenGraph:
    """Rescale the edge weights of a copy of the given graph.

    The ``'weight'`` attribute of every edge is log-transformed, min/max
    normalised via ``normalise_array_linear`` and finally passed through
    ``weight_scaling`` with its default parameters. The graph passed in is
    not modified because all changes are applied to a copy.

    Parameters
    ----------
    graph : TokenGraph
        graph whose edges carry a ``'weight'`` attribute

    Returns
    -------
    TokenGraph
        copy of the graph with adjusted edge weights, ``rescaled_weights``
        set to ``True`` and updated metadata
    """
    rescaled = graph.copy()
    # snapshot of the edges; same iteration order as the edge data view below
    edge_list = list(rescaled.edges)
    raw_weights = cast(
        list[int], [attrs['weight'] for attrs in rescaled.edges.values()]
    )
    log_weights = cast(
        npt.NDArray[np.float32], np.log(raw_weights, dtype=np.float32)
    )
    new_weights = weight_scaling(normalise_array_linear(log_weights))

    # assign new weight values
    for (source, target), new_weight in zip(edge_list, new_weights):
        rescaled[source][target]['weight'] = new_weight

    rescaled.rescaled_weights = True
    rescaled.update_metadata(logging=False)

    return rescaled
# ** ---------------------------------------
class TokenGraph(DiGraph):
def __init__(
@@ -276,6 +361,8 @@ class TokenGraph(DiGraph):
self._metadata_directed: dict[str, int] = {}
self._undirected: Graph | None = None
self._metadata_undirected: dict[str, int] = {}
# indicate rescaled weights
self.rescaled_weights: bool = False
def __repr__(self) -> str:
return self.__str__()
@@ -350,11 +437,17 @@ class TokenGraph(DiGraph):
) -> Graph | None:
if logging is None:
logging = self.logging
# cast to integer edge weights only if edges were not rescaled previously
cast_int: bool = True
if self.rescaled_weights:
cast_int = False
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
self._metadata_undirected = get_graph_metadata(
graph=self._undirected, logging=logging
self._undirected = convert_graph_to_undirected(
graph=self,
logging=logging,
cast_int=cast_int,
)
self._metadata_undirected = get_graph_metadata(graph=self._undirected, logging=False)
if not inplace:
return self._undirected

View File

@@ -56,6 +56,7 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
# ** token analysis
# ** graph postprocessing
EDGE_WEIGHT_DECIMALS: Final[int] = 4
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
# ** time analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][