adjustment of skewed edge weights
This commit is contained in:
@@ -13,6 +13,7 @@ import numpy.typing as npt
|
||||
from networkx import DiGraph, Graph
|
||||
from pandas import DataFrame
|
||||
|
||||
from lang_main.constants import EDGE_WEIGHT_DECIMALS
|
||||
from lang_main.io import load_pickle, save_pickle
|
||||
from lang_main.loggers import logger_graphs as logger
|
||||
from lang_main.types import (
|
||||
@@ -105,15 +106,26 @@ def update_graph(
|
||||
def convert_graph_to_undirected(
|
||||
graph: DiGraph,
|
||||
logging: bool = LOGGING_DEFAULT,
|
||||
cast_int: bool = False,
|
||||
) -> Graph:
|
||||
dtype = np.float32
|
||||
if cast_int:
|
||||
dtype = np.uint32
|
||||
# get adjacency matrix
|
||||
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=np.uint32))
|
||||
arr = typing.cast(npt.NDArray[np.uint32], adj_mat.to_numpy())
|
||||
adj_mat = typing.cast(DataFrame, nx.to_pandas_adjacency(G=graph, dtype=dtype))
|
||||
arr = typing.cast(npt.NDArray[np.float32 | np.uint32], adj_mat.to_numpy())
|
||||
if not cast_int:
|
||||
arr = arr * (10**EDGE_WEIGHT_DECIMALS)
|
||||
arr = np.round(arr, decimals=0)
|
||||
arr = arr.astype(np.uint32)
|
||||
# build undirected array: adding edges of lower triangular matrix to upper one
|
||||
arr_upper = np.triu(arr)
|
||||
arr_lower = np.tril(arr)
|
||||
arr_lower = np.rot90(np.fliplr(arr_lower))
|
||||
arr_new = arr_upper + arr_lower
|
||||
if not cast_int:
|
||||
arr_new = (arr_new / 10**EDGE_WEIGHT_DECIMALS).astype(np.float32)
|
||||
arr_new = np.round(arr_new, decimals=EDGE_WEIGHT_DECIMALS)
|
||||
# assign new data and create graph
|
||||
adj_mat.loc[:] = arr_new # type: ignore
|
||||
graph_undir = typing.cast(Graph, nx.from_pandas_adjacency(df=adj_mat))
|
||||
@@ -164,6 +176,7 @@ def convert_graph_to_cytoscape(
|
||||
}
|
||||
cyto_data.append(edge_data)
|
||||
|
||||
# TODO: add internal behaviour (if edge added check for new min/max)
|
||||
min_weight: int = 0
|
||||
max_weight: int = 0
|
||||
if weights:
|
||||
@@ -257,6 +270,78 @@ def filter_graph_by_node_degree(
|
||||
return filtered_graph
|
||||
|
||||
|
||||
def normalise_array_linear(
    array: npt.NDArray[np.floating],
) -> npt.NDArray[np.float32]:
    """apply standard linear (min/max) normalisation

    Each value is mapped to ``(value - min) / (max - min)``, so the smallest
    entry becomes 0 and the largest becomes 1.

    Parameters
    ----------
    array : npt.NDArray[np.floating]
        array which shall be normalised

    Returns
    -------
    npt.NDArray[np.float32]
        min/max normalised array; an empty input yields an empty array and
        an input whose values are all identical yields an array of zeros
    """
    # min/max reductions raise on zero-size arrays, so return early
    if array.size == 0:
        return array.astype(np.float32)

    arr_min = array.min()
    value_range = array.max() - arr_min
    # all values identical: dividing by the zero range would produce NaN
    # for every entry, so map everything to the lower bound instead
    if value_range == 0:
        return np.zeros_like(array, dtype=np.float32)

    arr_norm = (array - arr_min) / value_range

    return arr_norm.astype(np.float32)
|
||||
|
||||
|
||||
def weight_scaling(
    weights: npt.NDArray[np.float32],
    a: float = 1.1,
    b: float = 0.05,
) -> npt.NDArray[np.float32]:
    """non-linear scaling of edge weights which are already normalised to [0;1]

    The applied mapping is ``(b**weights - a) / (b - a)``: a weight of 1 stays
    at 1, while a weight of 0 is lifted to ``(1 - a) / (b - a)``. With the
    default parameters the result lies approximately within [0.1; 1].
    Based on:
    https://math.stackexchange.com/questions/4297805/exponential-function-that-passes-through-0-0-and-1-1-with-variable-slope

    Parameters
    ----------
    weights : npt.NDArray[np.float32]
        pre-normalised edge weights as 1D array
    a : float, optional
        determines the value assigned to edge weights of 0
        (approx. 0.1 with the defaults), by default 1.1
    b : float, optional
        base of the exponential term, adjusts the curvature, by default 0.05

    Returns
    -------
    npt.NDArray[np.float32]
        non-linearly adjusted edge weights as 1D array, rounded to
        ``EDGE_WEIGHT_DECIMALS`` decimals
    """
    numerator = b**weights - a
    denominator = b - a
    scaled = numerator / denominator

    return np.round(scaled, decimals=EDGE_WEIGHT_DECIMALS)
|
||||
|
||||
|
||||
def rescale_edge_weights(
    graph: TokenGraph,
) -> TokenGraph:
    """rescale the edge weights of a graph copy: logarithm, linear
    normalisation and non-linear scaling are applied in this order

    The passed graph is not modified; all changes are made on a copy
    obtained via ``graph.copy()``.

    Parameters
    ----------
    graph : TokenGraph
        graph whose edges carry a ``'weight'`` attribute

    Returns
    -------
    TokenGraph
        copy of the graph with rescaled edge weights, flagged via
        ``rescaled_weights = True`` and with refreshed metadata
    """
    graph = graph.copy()

    weights = cast(list[int], [data['weight'] for data in graph.edges.values()])
    # skip the numeric pipeline for an edgeless graph: there is nothing
    # to rescale
    if weights:
        w_log = cast(npt.NDArray[np.float32], np.log(weights, dtype=np.float32))
        weights_norm = normalise_array_linear(w_log)
        weights_adjusted = weight_scaling(weights_norm)
        # assign new weight values; the edge view is iterated in the same
        # order in which the weights were collected above
        for (node_1, node_2), weight in zip(graph.edges, weights_adjusted):
            graph[node_1][node_2]['weight'] = weight

    graph.rescaled_weights = True
    graph.update_metadata(logging=False)

    return graph
|
||||
|
||||
|
||||
# ** ---------------------------------------
|
||||
class TokenGraph(DiGraph):
|
||||
def __init__(
|
||||
@@ -276,6 +361,8 @@ class TokenGraph(DiGraph):
|
||||
self._metadata_directed: dict[str, int] = {}
|
||||
self._undirected: Graph | None = None
|
||||
self._metadata_undirected: dict[str, int] = {}
|
||||
# indicate rescaled weights
|
||||
self.rescaled_weights: bool = False
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.__str__()
|
||||
@@ -350,11 +437,17 @@ class TokenGraph(DiGraph):
|
||||
) -> Graph | None:
|
||||
if logging is None:
|
||||
logging = self.logging
|
||||
# cast to integer edge weights only if edges were not rescaled previously
|
||||
cast_int: bool = True
|
||||
if self.rescaled_weights:
|
||||
cast_int = False
|
||||
|
||||
self._undirected = convert_graph_to_undirected(graph=self, logging=logging)
|
||||
self._metadata_undirected = get_graph_metadata(
|
||||
graph=self._undirected, logging=logging
|
||||
self._undirected = convert_graph_to_undirected(
|
||||
graph=self,
|
||||
logging=logging,
|
||||
cast_int=cast_int,
|
||||
)
|
||||
self._metadata_undirected = get_graph_metadata(graph=self._undirected, logging=False)
|
||||
if not inplace:
|
||||
return self._undirected
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
|
||||
# ** token analysis
|
||||
|
||||
# ** graph postprocessing
|
||||
EDGE_WEIGHT_DECIMALS: Final[int] = 4
|
||||
THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
|
||||
# ** time analysis.uniqueness
|
||||
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
|
||||
|
||||
Reference in New Issue
Block a user