# (file metadata: 230 lines, 8.8 KiB, Python)
from enum import Enum  # noqa: I001
from importlib.util import find_spec
from pathlib import Path
from typing import Final, cast
import os

from sentence_transformers import SimilarityFunction

from lang_main import (
    CONFIG,
    CYTO_PATH_STYLESHEET,
    LIB_PATH,
)
from lang_main.types import (
    CytoLayoutProperties,
    CytoLayouts,
    ONNXExecutionProvider,  # noqa: F401
    STFRBackends,
    STFRDeviceTypes,
    STFRModelArgs,
    STFRModelTypes,
    STFRONNXFilenames,  # noqa: F401
    SpacyModelTypes,
)
# names re-exported as the public API of this constants module
__all__ = ['CONFIG', 'CYTO_PATH_STYLESHEET']
# ** dependencies
# Probe optional visualisation dependencies without importing them:
# `find_spec` returns None when the distribution is not installed.
_has_py4cyto: bool = find_spec('py4cytoscape') is not None
# Dash and Plotly static image export additionally require `kaleido`.
_has_dash: bool = find_spec('dash') is not None and find_spec('kaleido') is not None
_has_plotly: bool = find_spec('plotly') is not None and find_spec('kaleido') is not None
class Dependencies(Enum):
    """Availability of optional visualisation back-ends.

    Member values are the module-level probe results, so e.g.
    ``Dependencies.PY4C.value`` is ``True`` when ``py4cytoscape`` is
    importable.

    NOTE(review): ``Enum`` members with equal values are aliases of the
    first such member. Since the values here are booleans, two probes with
    the same result (the common case) collapse — e.g. ``Dependencies.DASH``
    may resolve to the same member object as ``Dependencies.PY4C``.
    Attribute access and ``.value`` still behave as expected, but iteration
    and identity comparisons may surprise — confirm this is intended.
    """

    PY4C = _has_py4cyto
    DASH = _has_dash
    PLOT = _has_plotly
# ** logging
# graphs
_logging_cfg = CONFIG['logging']
ENABLE_LOGGING: Final[bool] = _logging_cfg['enabled']
LOGGING_TO_FILE: Final[bool] = _logging_cfg['file']
LOGGING_TO_STDERR: Final[bool] = _logging_cfg['stderr']
# graph-specific logging is off unless explicitly enabled elsewhere
LOGGING_DEFAULT_GRAPHS: Final[bool] = False

# ** pickling
# protocol 5 (Python 3.8+) supports out-of-band buffers
PICKLE_PROTOCOL_VERSION: Final[int] = 5
# ** paths
# config placed in library path of application (usually "bin")


def _require_existing(path: Path, label: str) -> Path:
    """Resolve *path* and fail fast if it does not exist.

    ``label`` is used only to build the error message ('Input'/'Output').
    Raises ``FileNotFoundError`` so misconfiguration surfaces at import
    time instead of deep inside the pipeline.
    """
    resolved = path.resolve()
    if not resolved.exists():  # pragma: no cover
        raise FileNotFoundError(f'{label} path >>{resolved}<< does not exist.')
    return resolved


input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = _require_existing(input_path_cfg, 'Input')
save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = _require_existing(save_path_cfg, 'Output')
TK_GRAPH_EXPORT_FILENAME: Final[str] = CONFIG['paths']['graph_export_filename']
# ** control
# pipeline-stage skip switches, all sourced from the [control] config table
_control_cfg = CONFIG['control']
SKIP_PREPROCESSING: Final[bool] = _control_cfg['preprocessing_skip']
SKIP_TOKEN_ANALYSIS: Final[bool] = _control_cfg['token_analysis_skip']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = _control_cfg['graph_postprocessing_skip']
SKIP_GRAPH_RESCALING: Final[bool] = _control_cfg['graph_rescaling_skip']
SKIP_GRAPH_STATIC_RENDERING: Final[bool] = _control_cfg['graph_static_rendering_skip']
SKIP_TIME_ANALYSIS: Final[bool] = _control_cfg['time_analysis_skip']
# ** models
# ** loading
model_folder_cfg = LIB_PATH / Path(CONFIG['paths']['models'])
MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
if not MODEL_BASE_FOLDER.exists():  # pragma: no cover
    # include the resolved path so the failure is actionable (consistent
    # with the input/output path checks in this module)
    raise FileNotFoundError(f'Language model folder >>{MODEL_BASE_FOLDER}<< not found.')
# point sentence-transformers at the local model cache folder
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
# Environment variables recognised by this module:
# LANG_MAIN_BASE_FOLDERNAME : base folder of the library, not the root (the folder containing the Python installation)
# LANG_MAIN_SPACY_MODEL     : spaCy model to use; if not provided, the constant defined in the library is used (mostly internal use)
# LANG_MAIN_STFR_MODEL      : Sentence Transformer model to use; if not provided, the constant defined in the library is used (mostly internal use)
# LANG_MAIN_STFR_BACKEND    : STFR backend, choice between "torch" and "onnx"
# config option switching between the spaCy TRF and the medium model;
# the environment variable takes priority: if set, it is used as-is
SPACY_USE_LARGE_MODEL: Final[bool] = CONFIG['models']['use_large_model']

spacy_model_name: str | SpacyModelTypes | None
spacy_model_name = os.environ.get('LANG_MAIN_SPACY_MODEL', None)
if spacy_model_name is None:
    # no override given: pick the configured default model
    spacy_model_name = (
        SpacyModelTypes.DE_DEP_NEWS_TRF
        if SPACY_USE_LARGE_MODEL
        else SpacyModelTypes.DE_CORE_NEWS_MD
    )

SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = spacy_model_name
# Sentence Transformer model: environment override wins over the library default.
STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get(
    'LANG_MAIN_STFR_MODEL', STFRModelTypes.E5_BASE_STS_EN_DE
)
# (model, backend) pairs for which a custom/local model conversion exists
STFR_CUSTOM_MODELS: Final[dict[tuple[STFRModelTypes, STFRBackends], bool]] = {
    (STFRModelTypes.E5_BASE_STS_EN_DE, STFRBackends.ONNX): True,
}
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
# NOTE(review): `cast` performs no runtime validation — an unrecognised
# LANG_MAIN_STFR_BACKEND value passes through as a plain string. Downstream
# equality checks against STFRBackends members presumably rely on STFRBackends
# being string-comparable — confirm, or validate the env value here.
STFR_BACKEND: Final[STFRBackends] = cast(
    STFRBackends, os.environ.get('LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH)
)
# backend-specific keyword arguments for SentenceTransformer model loading
stfr_model_args_default: STFRModelArgs = {'torch_dtype': 'float32'}
stfr_model_args_onnx: STFRModelArgs = {
    'file_name': STFRONNXFilenames.ONNX_Q_UINT8,
    'provider': ONNXExecutionProvider.CPU,
    'export': False,
}
# select the argument set matching the configured backend
stfr_model_args: STFRModelArgs = (
    stfr_model_args_onnx if STFR_BACKEND == STFRBackends.ONNX else stfr_model_args_default
)

STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
# ** language dependency analysis
# ** POS
# coarse-grained POS tags retained during dependency analysis
# (Final added for consistency with the other constants in this module)
POS_OF_INTEREST: Final[frozenset[str]] = frozenset(
    ['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV', 'NUM']
)
# POS tags that only contribute indirectly (via tokens they are linked to)
POS_INDIRECT: Final[frozenset[str]] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])  # candidate fine-grained tags, currently disabled
TAG_OF_INTEREST: Final[frozenset[str]] = frozenset()
# ** export
# ** preprocessing
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
TARGET_FEATURE: Final[str] = CONFIG['preprocess']['target_feature']

# a negative character threshold is clamped to zero
threshold_amount_characters: int = CONFIG['preprocess']['threshold_amount_characters']
threshold_amount_characters = max(threshold_amount_characters, 0)
THRESHOLD_AMOUNT_CHARACTERS: Final[int] = threshold_amount_characters

# the similarity threshold must lie in [0, 1]; reject anything else outright
threshold_similarity: float = CONFIG['preprocess']['threshold_similarity']
if threshold_similarity < 0 or threshold_similarity > 1:
    raise ValueError(
        '[CONFIG][preprocess][threshold_similarity] Preprocessing similarity '
        'threshold must be between 0 and 1.'
    )
THRESHOLD_SIMILARITY: Final[float] = threshold_similarity
# ** token analysis

# ** graph postprocessing
ENABLE_EDGE_RESCALING: Final[bool] = CONFIG['graph_postprocessing']['enable_edge_rescaling']
EDGE_WEIGHT_DECIMALS: Final[int] = 6
# a negative configured edge limit means "no limit" and maps to None
max_edge_number_cfg: int = CONFIG['graph_postprocessing']['max_edge_number']
max_edge_number: int | None = max_edge_number_cfg if max_edge_number_cfg >= 0 else None
MAX_EDGE_NUMBER: Final[int | None] = max_edge_number

# names of derived node properties attached during postprocessing
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'
# ** graph exports (Cytoscape)
# hard limits to keep exported networks manageable in Cytoscape
CYTO_MAX_NODE_COUNT: Final[int] = 500
CYTO_MAX_EDGE_COUNT: Final[int] = 800
CYTO_COLLECTION_NAME: Final[str] = 'lang_main'
CYTO_BASE_NETWORK_NAME: Final[str] = 'token_graph'
CYTO_LAYOUT_NAME: Final[CytoLayouts] = 'force-directed'
# parameters passed to the Cytoscape force-directed layout
CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
    'numIterations': 1000,
    'defaultSpringCoefficient': 1e-4,
    'defaultSpringLength': 45,
    'defaultNodeMass': 11,
    'isDeterministic': True,
    'singlePartition': False,
}
CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
# name for property, on which selection is done
CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
# subgraph extraction: how many subgraphs and how deep to walk neighbours
CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
CYTO_NETWORK_ZOOM_FACTOR: Final[float] = 0.96
# ** time_analysis.uniqueness
_uniqueness_cfg = CONFIG['time_analysis']['uniqueness']
# a negative threshold is clamped to zero
threshold_unique_texts: int = _uniqueness_cfg['threshold_unique_texts']
threshold_unique_texts = max(threshold_unique_texts, 0)
THRESHOLD_UNIQUE_TEXTS: Final[int] = threshold_unique_texts
UNIQUE_CRITERION_FEATURE: Final[str] = _uniqueness_cfg['criterion_feature']
FEATURE_NAME_OBJ_ID: Final[str] = _uniqueness_cfg['feature_name_obj_id']
FEATURE_NAME_OBJ_TEXT: Final[str] = _uniqueness_cfg['feature_name_obj_text']
# ** time_analysis.preparation
# feature names for the time deltas computed during preparation
_preparation_cfg = CONFIG['time_analysis']['preparation']
NAME_DELTA_FEAT_TO_REPAIR: Final[str] = _preparation_cfg['name_delta_feat_to_repair']
NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = _preparation_cfg['name_delta_feat_to_next_failure']
# ** time_analysis.model_input
_model_input_cfg = CONFIG['time_analysis']['model_input']
# freeze config lists into tuples so they cannot be mutated downstream
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(_model_input_cfg['input_features'])
ACTIVITY_FEATURE: Final[str] = _model_input_cfg['activity_feature']
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(_model_input_cfg['activity_types'])
# a negative activity threshold is clamped to zero
threshold_num_activities: int = _model_input_cfg['threshold_num_activities']
threshold_num_activities = max(threshold_num_activities, 0)
THRESHOLD_NUM_ACTIVITIES: Final[int] = threshold_num_activities
# the timeline similarity threshold must lie in [0, 1]
threshold_timeline_similarity: float = CONFIG['time_analysis']['model_input'][
    'threshold_similarity'
]
if threshold_timeline_similarity < 0 or threshold_timeline_similarity > 1:
    raise ValueError(
        '[CONFIG][time_analysis.model_input][threshold_similarity] Timeline similarity '
        'threshold must be between 0 and 1.'
    )
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = threshold_timeline_similarity