from enum import Enum  # noqa: I001
from importlib.util import find_spec
from pathlib import Path
from typing import Final, cast
import os

from sentence_transformers import SimilarityFunction

from lang_main import (
    CONFIG,
    CYTO_PATH_STYLESHEET,
    LIB_PATH,
)
from lang_main.types import (
    CytoLayoutProperties,
    CytoLayouts,
    ONNXExecutionProvider,  # noqa: F401
    STFRBackends,
    STFRDeviceTypes,
    STFRModelArgs,
    STFRModelTypes,
    STFRONNXFilenames,  # noqa: F401
    SpacyModelTypes,
)

__all__ = [
    'CONFIG',
    'CYTO_PATH_STYLESHEET',
]

# ** dependencies
_has_py4cyto: bool = find_spec('py4cytoscape') is not None
_has_dash: bool = find_spec('dash') is not None and find_spec('kaleido') is not None
_has_plotly: bool = find_spec('plotly') is not None and find_spec('kaleido') is not None


class Dependencies(Enum):
    PY4C = _has_py4cyto
    DASH = _has_dash
    PLOT = _has_plotly


# ** logging
# graphs
ENABLE_LOGGING: Final[bool] = CONFIG['logging']['enabled']
LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
LOGGING_DEFAULT_GRAPHS: Final[bool] = False

# ** pickling
PICKLE_PROTOCOL_VERSION: Final[int] = 5

# ** paths
# config is placed in the library path of the application (usually "bin")
input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve()
if not INPUT_PATH_FOLDER.exists():  # pragma: no cover
    raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve()
if not SAVE_PATH_FOLDER.exists():  # pragma: no cover
    raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.')
TK_GRAPH_EXPORT_FILENAME: Final[str] = CONFIG['paths']['graph_export_filename']

# ** control
SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
SKIP_GRAPH_RESCALING: Final[bool] = CONFIG['control']['graph_rescaling_skip']
SKIP_GRAPH_STATIC_RENDERING: Final[bool] = CONFIG['control']['graph_static_rendering_skip']
SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']

# ** models
# ** loading
model_folder_cfg = LIB_PATH / Path(CONFIG['paths']['models'])
MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
if not MODEL_BASE_FOLDER.exists():
    raise FileNotFoundError('Language model folder not found.')
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)

# environment variables:
# LANG_MAIN_BASE_FOLDERNAME : base folder of the library, not the root (the folder in which
#                             the Python installation is found)
# LANG_MAIN_SPACY_MODEL     : spaCy model to use; if not provided, the constant value defined
#                             in the library is used; mainly for internal use
# LANG_MAIN_STFR_MODEL      : Sentence Transformer model to use; if not provided, the constant
#                             value defined in the library is used; mainly for internal use
# LANG_MAIN_STFR_BACKEND    : STFR backend, choice between "torch" and "onnx"

# config option to switch between the spaCy TRF and the medium model
# the environment variable has priority: if set, it is used
SPACY_USE_LARGE_MODEL: Final[bool] = CONFIG['models']['use_large_model']
spacy_model_name: str | SpacyModelTypes | None
spacy_model_name = os.environ.get('LANG_MAIN_SPACY_MODEL', None)
if spacy_model_name is None:
    if SPACY_USE_LARGE_MODEL:
        spacy_model_name = SpacyModelTypes.DE_DEP_NEWS_TRF
    else:
        spacy_model_name = SpacyModelTypes.DE_CORE_NEWS_MD
SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = spacy_model_name
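
# Illustrative example only (values are hypothetical, not shipped defaults): the environment
# variables documented above take priority over the config file when they are set before the
# library is imported, e.g. from a shell:
#   export LANG_MAIN_SPACY_MODEL=de_dep_news_trf
#   export LANG_MAIN_STFR_BACKEND=onnx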

STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get(
    'LANG_MAIN_STFR_MODEL', STFRModelTypes.E5_BASE_STS_EN_DE
)
STFR_CUSTOM_MODELS: Final[dict[tuple[STFRModelTypes, STFRBackends], bool]] = {
    (STFRModelTypes.E5_BASE_STS_EN_DE, STFRBackends.ONNX): True,
}
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
STFR_BACKEND: Final[STFRBackends] = cast(
    STFRBackends, os.environ.get('LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH)
)
stfr_model_args_default: STFRModelArgs = {'torch_dtype': 'float32'}
stfr_model_args_onnx: STFRModelArgs = {
    'file_name': STFRONNXFilenames.ONNX_Q_UINT8,
    'provider': ONNXExecutionProvider.CPU,
    'export': False,
}
stfr_model_args: STFRModelArgs
if STFR_BACKEND == STFRBackends.ONNX:
    stfr_model_args = stfr_model_args_onnx
else:
    stfr_model_args = stfr_model_args_default
STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args

# ** language dependency analysis
# ** POS
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV', 'NUM'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()

# ** export

# ** preprocessing
DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
TARGET_FEATURE: Final[str] = CONFIG['preprocess']['target_feature']
threshold_amount_characters: int = CONFIG['preprocess']['threshold_amount_characters']
if threshold_amount_characters < 0:
    threshold_amount_characters = 0
THRESHOLD_AMOUNT_CHARACTERS: Final[int] = threshold_amount_characters
threshold_similarity: float = CONFIG['preprocess']['threshold_similarity']
if threshold_similarity < 0 or threshold_similarity > 1:
    raise ValueError(
        '[CONFIG][preprocess][threshold_similarity] Preprocessing similarity '
        'threshold must be between 0 and 1.'
    )
THRESHOLD_SIMILARITY: Final[float] = threshold_similarity
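
# Example of the corresponding config entries under [preprocess] (illustrative values only;
# the actual file format and location are defined by CONFIG elsewhere in the application):
#   threshold_amount_characters = 10   # negative values are clamped to 0 above
#   threshold_similarity = 0.85        # must lie within [0, 1], otherwise a ValueError is raised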

# ** token analysis

# ** graph postprocessing
ENABLE_EDGE_RESCALING: Final[bool] = CONFIG['graph_postprocessing']['enable_edge_rescaling']
EDGE_WEIGHT_DECIMALS: Final[int] = 6
max_edge_number: int | None = None
max_edge_number_cfg: int = CONFIG['graph_postprocessing']['max_edge_number']
if max_edge_number_cfg >= 0:
    max_edge_number = max_edge_number_cfg
MAX_EDGE_NUMBER: Final[int | None] = max_edge_number
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'

# ** graph exports (Cytoscape)
CYTO_MAX_NODE_COUNT: Final[int] = 500
CYTO_MAX_EDGE_COUNT: Final[int] = 800
CYTO_COLLECTION_NAME: Final[str] = 'lang_main'
CYTO_BASE_NETWORK_NAME: Final[str] = 'token_graph'
CYTO_LAYOUT_NAME: Final[CytoLayouts] = 'force-directed'
CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
    'numIterations': 1000,
    'defaultSpringCoefficient': 1e-4,
    'defaultSpringLength': 45,
    'defaultNodeMass': 11,
    'isDeterministic': True,
    'singlePartition': False,
}
CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
# name of the property on which node selection is done
CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
CYTO_NETWORK_ZOOM_FACTOR: Final[float] = 0.96

# ** time_analysis.uniqueness
threshold_unique_texts: int = CONFIG['time_analysis']['uniqueness']['threshold_unique_texts']
if threshold_unique_texts < 0:
    threshold_unique_texts = 0
THRESHOLD_UNIQUE_TEXTS: Final[int] = threshold_unique_texts
UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
FEATURE_NAME_OBJ_TEXT: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'feature_name_obj_text'
]

# ** time_analysis.preparation
NAME_DELTA_FEAT_TO_REPAIR: Final[str] = CONFIG['time_analysis']['preparation'][
    'name_delta_feat_to_repair'
]
NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = CONFIG['time_analysis']['preparation'][
    'name_delta_feat_to_next_failure'
]

# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
    CONFIG['time_analysis']['model_input']['input_features']
)
ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
    CONFIG['time_analysis']['model_input']['activity_types']
)
threshold_num_activities: int = CONFIG['time_analysis']['model_input'][
    'threshold_num_activities'
]
if threshold_num_activities < 0:
    threshold_num_activities = 0
THRESHOLD_NUM_ACTIVITIES: Final[int] = threshold_num_activities
threshold_timeline_similarity: float = CONFIG['time_analysis']['model_input'][
    'threshold_similarity'
]
if threshold_timeline_similarity < 0 or threshold_timeline_similarity > 1:
    raise ValueError(
        '[CONFIG][time_analysis.model_input][threshold_similarity] Timeline similarity '
        'threshold must be between 0 and 1.'
    )
THRESHOLD_TIMELINE_SIMILARITY: Final[float] = threshold_timeline_similarity
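

# Minimal self-check sketch (an addition for illustration, not part of the library's public
# behaviour): running this module directly prints the resolved, environment-dependent model
# settings, which can help when debugging the LANG_MAIN_* overrides documented above.
if __name__ == '__main__':  # pragma: no cover
    print(f'spaCy model:       {SPACY_MODEL_NAME}')
    print(f'STFR model:        {STFR_MODEL_NAME}')
    print(f'STFR backend:      {STFR_BACKEND}')
    print(f'STFR model args:   {STFR_MODEL_ARGS}')
    print(f'model base folder: {MODEL_BASE_FOLDER}')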