adaptation for deployment structure

This commit is contained in:
Florian Förster 2024-12-11 18:15:07 +01:00
parent 38aa0739ad
commit 123869e203
11 changed files with 106 additions and 110 deletions

0
build.py Normal file
View File

View File

@ -123,6 +123,7 @@ exclude_also = [
[tool.coverage.html]
directory = "reports/coverage"
[dependency-groups]
dev = [
"cython>=3.0.10",

View File

@ -4,7 +4,7 @@ from lang_main.constants import (
INPUT_PATH_FOLDER,
PATH_TO_DATASET,
SAVE_PATH_FOLDER,
input_path_conf,
input_path_cfg,
)
print(SAVE_PATH_FOLDER, '\n')
@ -12,4 +12,4 @@ print(INPUT_PATH_FOLDER, '\n')
print(PATH_TO_DATASET, '\n')
print('------------------------')
print(Path.cwd(), '\n', input_path_conf)
print(Path.cwd(), '\n', input_path_cfg)

View File

@ -6,7 +6,6 @@ from lang_main.config import (
CONFIG_FILENAME,
CYTO_STYLESHEET_FILENAME,
PKG_DIR,
PREFER_INTERNAL_CONFIG,
STOP_FOLDER,
get_config_paths,
load_cfg,
@ -23,13 +22,19 @@ CONFIG: Final[dict[str, Any]] = load_cfg(
starting_path=PKG_DIR,
glob_pattern=CONFIG_FILENAME,
stop_folder_name=STOP_FOLDER,
cfg_path_internal=cfg_path_internal,
prefer_internal_config=PREFER_INTERNAL_CONFIG,
)
base_parent_path = search_base_path(PKG_DIR, stop_folder_name=BASE_FOLDERNAME)
if base_parent_path is None:
raise FileNotFoundError('Could not resolve base path of library')
BASE_PATH: Final[Path] = base_parent_path
lib_path = search_base_path(PKG_DIR, stop_folder_name=STOP_FOLDER)
if lib_path is None:
raise FileNotFoundError('Could not resolve library path of application')
LIB_PATH: Final[Path] = lib_path
print(f'Library path is: {LIB_PATH}', flush=True)
root_path = search_base_path(PKG_DIR, stop_folder_name=BASE_FOLDERNAME)
if root_path is None:
raise FileNotFoundError('Could not resolve root path of application')
ROOT_PATH: Final[Path] = root_path
print(f'Root path is: {ROOT_PATH}', flush=True)
# ** Cytoscape configuration

View File

@ -2,11 +2,13 @@ from __future__ import annotations
import logging
import os
import sys
import tomllib
from pathlib import Path
from typing import Any, Final
from lang_main.errors import LangMainConfigNotFoundError
# from lang_main.loggers import logger_config as logger
from lang_main.search import search_cwd, search_iterative
_has_py4cyto: bool = True
@ -29,10 +31,10 @@ if _has_py4cyto:
p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
# ** lang-main config
BASE_FOLDERNAME: Final[str] = 'lang-main'
# ENV variable: LANG_MAIN_BASE_FOLDERNAME
BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang-main')
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
PREFER_INTERNAL_CONFIG: Final[bool] = False
PKG_DIR: Final[Path] = Path(__file__).parent
STOP_FOLDER: Final[str] = 'python'
@ -42,7 +44,7 @@ def load_toml_config(
) -> dict[str, Any]:
with open(path_to_toml, 'rb') as f:
data = tomllib.load(f)
print('Loaded TOML config file successfully.', file=sys.stderr, flush=True)
print('Loaded TOML config file successfully.', flush=True)
return data
@ -63,26 +65,54 @@ def load_cfg(
starting_path: Path,
glob_pattern: str,
stop_folder_name: str | None,
cfg_path_internal: Path,
prefer_internal_config: bool = False,
) -> dict[str, Any]:
cfg_path: Path | None
# look for external config first, if not found use internal one
if prefer_internal_config:
cfg_path = cfg_path_internal
else:
cfg_path = search_cwd(glob_pattern)
"""Look for configuration file. Internal configs are not used any more because
the library behaviour is only guaranteed by external configurations.
if cfg_path is None:
cfg_path = search_iterative(
starting_path=starting_path,
glob_pattern=glob_pattern,
stop_folder_name=stop_folder_name,
)
# backup: use internal config
if cfg_path is None:
cfg_path = cfg_path_internal
Parameters
----------
starting_path : Path
path to start for the lookup
glob_pattern : str
pattern of the config file naming scheme
stop_folder_name : str | None
folder name at which the lookup should stop, the parent folder
is also searched, e.g.
if starting_path is path/to/start/folder and stop_folder_name is 'to',
then path/ is also searched
Returns
-------
dict[str, Any]
loaded config file
Raises
------
LangMainConfigNotFoundError
if no config file was found
"""
cfg_path: Path | None
print('Looking for cfg file in CWD.', flush=True)
cfg_path = search_cwd(glob_pattern)
if cfg_path is None:
print(
(
f'Looking iteratively for config file. Start: {starting_path}, '
f'stop folder: {stop_folder_name}'
),
flush=True,
)
cfg_path = search_iterative(
starting_path=starting_path,
glob_pattern=glob_pattern,
stop_folder_name=stop_folder_name,
)
if cfg_path is None:
raise LangMainConfigNotFoundError('Config file was not found.')
config = load_toml_config(path_to_toml=cfg_path)
print(f'Loaded config from: >>{cfg_path}<<')
return config.copy()

View File

@ -6,7 +6,11 @@ import os
from sentence_transformers import SimilarityFunction
from lang_main import CONFIG, CYTO_PATH_STYLESHEET, BASE_PATH
from lang_main import (
CONFIG,
CYTO_PATH_STYLESHEET,
LIB_PATH,
)
from lang_main.types import (
CytoLayoutProperties,
CytoLayouts,
@ -47,15 +51,18 @@ LOGGING_DEFAULT_GRAPHS: Final[bool] = False
PICKLE_PROTOCOL_VERSION: Final[int] = 5
# ** paths
input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve()
# config placed in library path of application (usually "bin")
input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs'])
INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve()
# TODO reactivate later
# if not INPUT_PATH_FOLDER.exists():
# raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
save_path_conf = Path.cwd() / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = save_path_conf.resolve()
path_dataset_conf = Path.cwd() / Path(CONFIG['paths']['dataset'])
PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
if not INPUT_PATH_FOLDER.exists():
raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.')
save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results'])
SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve()
if not SAVE_PATH_FOLDER.exists():
raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.')
path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset'])
PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve()
# if not PATH_TO_DATASET.exists():
# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
# ** control
@ -69,12 +76,13 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models
# ** loading
MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models'
MODEL_BASE_FOLDER: Final[Path] = BASE_PATH / MODEL_BASE_FOLDER_NAME
# MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models'
model_folder_cfg = LIB_PATH / Path(CONFIG['paths']['models'])
MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve()
if not MODEL_BASE_FOLDER.exists():
raise FileNotFoundError('Language model folder not found.')
os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_DEP_NEWS_TRF
SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM
STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE

View File

@ -1,4 +1,8 @@
# ** meta exceptions
class LangMainConfigNotFoundError(Exception):
"""Error raised if a config file could not be found successfully"""
class LanguageModelNotFoundError(Exception):
"""Error raised if a given language model could not be loaded successfully"""

View File

@ -3,11 +3,12 @@
pkg = 'lang_main_internal'
[paths]
inputs = './inputs/'
inputs = './data/in/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/test_20240807/'
results = './data/out/'
dataset = '../data/02_202307/Export4.csv'
models = '../../lang-models'
[logging]
enabled = true
@ -17,11 +18,11 @@ file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]

View File

@ -1,57 +0,0 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

View File

@ -4,8 +4,8 @@ from pathlib import Path
from time import gmtime
from typing import Final
from lang_main import LIB_PATH
from lang_main.constants import (
BASE_PATH,
ENABLE_LOGGING,
LOGGING_TO_FILE,
LOGGING_TO_STDERR,
@ -16,11 +16,13 @@ from lang_main.types import LoggingLevels
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
LOG_FILE_FOLDER: Final[Path] = BASE_PATH / 'logs'
LOG_FILE_FOLDER: Final[Path] = LIB_PATH / 'logs'
if not LOG_FILE_FOLDER.exists():
LOG_FILE_FOLDER.mkdir(parents=True)
LOG_FILE_PATH: Final[Path] = LOG_FILE_FOLDER / 'lang-main.log'
LOGGING_LEVEL_STDERR: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_FILE: Final[LoggingLevels] = LoggingLevels.DEBUG
# ** formatters
logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
@ -29,7 +31,7 @@ logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
null_handler = logging.NullHandler()
if ENABLE_LOGGING and LOGGING_TO_STDERR:
logger_all_handler_stderr = logging.StreamHandler()
logger_all_handler_stderr.setLevel(LoggingLevels.WARNING)
logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR)
logger_all_handler_stderr.setFormatter(logger_all_formater)
else:
logger_all_handler_stderr = null_handler
@ -41,14 +43,13 @@ if ENABLE_LOGGING and LOGGING_TO_FILE:
maxBytes=5_242_880,
backupCount=1,
)
logger_all_handler_file.setLevel(LoggingLevels.DEBUG)
logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE)
logger_all_handler_file.setFormatter(logger_all_formater)
else:
logger_all_handler_file = null_handler
# ** logging levels
LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG
@ -56,12 +57,15 @@ LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_CONFIG: Final[LoggingLevels] = LoggingLevels.DEBUG
# ** loggers and configuration
logger_all = logging.getLogger('lang_main')
logger_all.addHandler(logger_all_handler_stderr)
logger_all.addHandler(logger_all_handler_file)
logger_config = logging.getLogger('lang_main.config')
logger_config.setLevel(LOGGING_LEVEL_CONFIG)
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing')

View File

@ -1,5 +1,5 @@
from lang_main import BASE_PATH
from lang_main import ROOT_PATH
def test_base_path():
assert BASE_PATH is not None
assert ROOT_PATH is not None