diff --git a/build.py b/build.py new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml index 14766f6..634c110 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,7 @@ exclude_also = [ [tool.coverage.html] directory = "reports/coverage" + [dependency-groups] dev = [ "cython>=3.0.10", diff --git a/scripts/pre_test_examples.py b/scripts/pre_test_examples.py index 5416035..ca3f76e 100644 --- a/scripts/pre_test_examples.py +++ b/scripts/pre_test_examples.py @@ -4,7 +4,7 @@ from lang_main.constants import ( INPUT_PATH_FOLDER, PATH_TO_DATASET, SAVE_PATH_FOLDER, - input_path_conf, + input_path_cfg, ) print(SAVE_PATH_FOLDER, '\n') @@ -12,4 +12,4 @@ print(INPUT_PATH_FOLDER, '\n') print(PATH_TO_DATASET, '\n') print('------------------------') -print(Path.cwd(), '\n', input_path_conf) +print(Path.cwd(), '\n', input_path_cfg) diff --git a/src/lang_main/__init__.py b/src/lang_main/__init__.py index 0fd67a0..90e2f84 100644 --- a/src/lang_main/__init__.py +++ b/src/lang_main/__init__.py @@ -6,7 +6,6 @@ from lang_main.config import ( CONFIG_FILENAME, CYTO_STYLESHEET_FILENAME, PKG_DIR, - PREFER_INTERNAL_CONFIG, STOP_FOLDER, get_config_paths, load_cfg, @@ -23,13 +22,19 @@ CONFIG: Final[dict[str, Any]] = load_cfg( starting_path=PKG_DIR, glob_pattern=CONFIG_FILENAME, stop_folder_name=STOP_FOLDER, - cfg_path_internal=cfg_path_internal, - prefer_internal_config=PREFER_INTERNAL_CONFIG, ) -base_parent_path = search_base_path(PKG_DIR, stop_folder_name=BASE_FOLDERNAME) -if base_parent_path is None: - raise FileNotFoundError('Could not resolve base path of library') -BASE_PATH: Final[Path] = base_parent_path + +lib_path = search_base_path(PKG_DIR, stop_folder_name=STOP_FOLDER) +if lib_path is None: + raise FileNotFoundError('Could not resolve library path of application') +LIB_PATH: Final[Path] = lib_path +print(f'Library path is: {LIB_PATH}', flush=True) + +root_path = search_base_path(PKG_DIR, stop_folder_name=BASE_FOLDERNAME) +if root_path is None: + raise FileNotFoundError('Could not resolve root path of application') +ROOT_PATH: Final[Path] = root_path +print(f'Root path is: {ROOT_PATH}', flush=True) # ** Cytoscape configuration diff --git a/src/lang_main/config.py b/src/lang_main/config.py index afcb819..845d1cd 100644 --- a/src/lang_main/config.py +++ b/src/lang_main/config.py @@ -2,11 +2,13 @@ from __future__ import annotations import logging import os -import sys import tomllib from pathlib import Path from typing import Any, Final +from lang_main.errors import LangMainConfigNotFoundError + +# from lang_main.loggers import logger_config as logger from lang_main.search import search_cwd, search_iterative _has_py4cyto: bool = True @@ -29,10 +31,10 @@ if _has_py4cyto: p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler()) # ** lang-main config -BASE_FOLDERNAME: Final[str] = 'lang-main' +# ENV variable: LANG_MAIN_BASE_FOLDERNAME +BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang-main') CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml' -PREFER_INTERNAL_CONFIG: Final[bool] = False PKG_DIR: Final[Path] = Path(__file__).parent STOP_FOLDER: Final[str] = 'python' @@ -42,7 +44,7 @@ def load_toml_config( ) -> dict[str, Any]: with open(path_to_toml, 'rb') as f: data = tomllib.load(f) - print('Loaded TOML config file successfully.', file=sys.stderr, flush=True) + print('Loaded TOML config file successfully.', flush=True) return data @@ -63,26 +65,54 @@ def load_cfg( starting_path: Path, glob_pattern: str, stop_folder_name: str | None, - cfg_path_internal: Path, - prefer_internal_config: bool = False, ) -> dict[str, Any]: - cfg_path: Path | None - # look for external config first, if not found use internal one - if prefer_internal_config: - cfg_path = cfg_path_internal - else: - cfg_path = search_cwd(glob_pattern) + """Look for configuration file. Internal configs are not used any more because + the library behaviour is only guaranteed by external configurations. - if cfg_path is None: - cfg_path = search_iterative( - starting_path=starting_path, - glob_pattern=glob_pattern, - stop_folder_name=stop_folder_name, - ) - # backup: use internal config - if cfg_path is None: - cfg_path = cfg_path_internal + Parameters + ---------- + starting_path : Path + path to start for the lookup + glob_pattern : str + pattern of the config file naming scheme + stop_folder_name : str | None + folder name at which the lookup should stop, the parent folder + is also searched, e.g. + if starting_path is path/to/start/folder and stop_folder_name is 'to', + then path/ is also searched + + Returns + ------- + dict[str, Any] + loaded config file + + Raises + ------ + LangMainConfigNotFoundError + if no config file was found + """ + cfg_path: Path | None + print('Looking for cfg file in CWD.', flush=True) + cfg_path = search_cwd(glob_pattern) + + if cfg_path is None: + print( + ( + f'Looking iteratively for config file. Start: {starting_path}, ' + f'stop folder: {stop_folder_name}' + ), + flush=True, + ) + cfg_path = search_iterative( + starting_path=starting_path, + glob_pattern=glob_pattern, + stop_folder_name=stop_folder_name, + ) + + if cfg_path is None: + raise LangMainConfigNotFoundError('Config file was not found.') config = load_toml_config(path_to_toml=cfg_path) + print(f'Loaded config from: >>{cfg_path}<<') return config.copy() diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py index 1f604a8..5ce4001 100644 --- a/src/lang_main/constants.py +++ b/src/lang_main/constants.py @@ -6,7 +6,11 @@ import os from sentence_transformers import SimilarityFunction -from lang_main import CONFIG, CYTO_PATH_STYLESHEET, BASE_PATH +from lang_main import ( + CONFIG, + CYTO_PATH_STYLESHEET, + LIB_PATH, +) from lang_main.types import ( CytoLayoutProperties, CytoLayouts, @@ -47,15 +51,18 @@ LOGGING_DEFAULT_GRAPHS: Final[bool] = False PICKLE_PROTOCOL_VERSION: Final[int] = 5 # ** paths -input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs']) -INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve() +# config placed in library path of application (usually "bin") +input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs']) +INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve() # TODO reactivate later -# if not INPUT_PATH_FOLDER.exists(): -# raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.') -save_path_conf = Path.cwd() / Path(CONFIG['paths']['results']) -SAVE_PATH_FOLDER: Final[Path] = save_path_conf.resolve() -path_dataset_conf = Path.cwd() / Path(CONFIG['paths']['dataset']) -PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve() +if not INPUT_PATH_FOLDER.exists(): + raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.') +save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results']) +SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve() +if not SAVE_PATH_FOLDER.exists(): + raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.') +path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset']) +PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve() # if not PATH_TO_DATASET.exists(): # raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.') # ** control @@ -69,12 +76,13 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip'] # ** models # ** loading -MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models' -MODEL_BASE_FOLDER: Final[Path] = BASE_PATH / MODEL_BASE_FOLDER_NAME +# MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models' +model_folder_cfg = LIB_PATH / Path(CONFIG['paths']['models']) +MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve() if not MODEL_BASE_FOLDER.exists(): raise FileNotFoundError('Language model folder not found.') os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER) -SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_DEP_NEWS_TRF +SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2 STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE diff --git a/src/lang_main/errors.py b/src/lang_main/errors.py index 36625db..c544674 100644 --- a/src/lang_main/errors.py +++ b/src/lang_main/errors.py @@ -1,4 +1,8 @@ # ** meta exceptions +class LangMainConfigNotFoundError(Exception): + """Error raised if a config file could not be found successfully""" + + class LanguageModelNotFoundError(Exception): """Error raised if a given language model could not be loaded successfully""" diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml index 729e8b8..b77c206 100644 --- a/src/lang_main/lang_main_config.toml +++ b/src/lang_main/lang_main_config.toml @@ -3,11 +3,12 @@ pkg = 'lang_main_internal' [paths] -inputs = './inputs/' +inputs = './data/in/' # results = './results/dummy_N_1000/' # dataset = '../data/Dummy_Dataset_N_1000.csv' -results = './results/test_20240807/' +results = './data/out/' dataset = '../data/02_202307/Export4.csv' +models = '../../lang-models' [logging] enabled = true @@ -17,11 +18,11 @@ file = true # only debugging features, production-ready pipelines should always # be fully executed [control] -preprocessing_skip = true +preprocessing_skip = false token_analysis_skip = false graph_postprocessing_skip = false graph_rescaling_skip = false -graph_static_rendering_skip = false +graph_static_rendering_skip = true time_analysis_skip = true [preprocess] diff --git a/src/lang_main/lang_main_config_old.toml b/src/lang_main/lang_main_config_old.toml deleted file mode 100644 index ef7dbbc..0000000 --- a/src/lang_main/lang_main_config_old.toml +++ /dev/null @@ -1,57 +0,0 @@ -# lang_main: Config file - -[paths] -inputs = './inputs/' -results = './results/test_new2/' -dataset = './01_2_Rohdaten_neu/Export4.csv' -#results = './results/Export7/' -#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' -#results = './results/Export7_trunc/' -#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv' - -# only debugging features, production-ready pipelines should always -# be fully executed -[control] -preprocessing_skip = false -token_analysis_skip = false -graph_postprocessing_skip = false -time_analysis_skip = false - -#[export_filenames] -#filename_cossim_filter_candidates = 'CosSim-FilterCandidates' - -[preprocess] -filename_cossim_filter_candidates = 'CosSim-FilterCandidates' -date_cols = [ - "VorgangsDatum", - "ErledigungsDatum", - "Arbeitsbeginn", - "ErstellungsDatum", -] -threshold_amount_characters = 5 -threshold_similarity = 0.8 - -[graph_postprocessing] -threshold_edge_weight = 150 - -[time_analysis.uniqueness] -threshold_unique_texts = 4 -criterion_feature = 'HObjektText' -feature_name_obj_id = 'ObjektID' - -[time_analysis.model_input] -# input_features = [ -# 'VorgangsTypName', -# 'VorgangsArtText', -# 'VorgangsBeschreibung', -# ] -input_features = [ - 'VorgangsBeschreibung', -] -activity_feature = 'VorgangsTypName' -activity_types = [ - 'Reparaturauftrag (Portal)', - 'Störungsmeldung', -] -threshold_num_acitivities = 1 -threshold_similarity = 0.8 \ No newline at end of file diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py index 739a0a5..35a575b 100644 --- a/src/lang_main/loggers.py +++ b/src/lang_main/loggers.py @@ -4,8 +4,8 @@ from pathlib import Path from time import gmtime from typing import Final +from lang_main import LIB_PATH from lang_main.constants import ( - BASE_PATH, ENABLE_LOGGING, LOGGING_TO_FILE, LOGGING_TO_STDERR, @@ -16,11 +16,13 @@ from lang_main.types import LoggingLevels logging.Formatter.converter = gmtime LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s' LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000' -LOG_FILE_FOLDER: Final[Path] = BASE_PATH / 'logs' +LOG_FILE_FOLDER: Final[Path] = LIB_PATH / 'logs' if not LOG_FILE_FOLDER.exists(): LOG_FILE_FOLDER.mkdir(parents=True) LOG_FILE_PATH: Final[Path] = LOG_FILE_FOLDER / 'lang-main.log' +LOGGING_LEVEL_STDERR: Final[LoggingLevels] = LoggingLevels.INFO +LOGGING_LEVEL_FILE: Final[LoggingLevels] = LoggingLevels.DEBUG # ** formatters logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT) @@ -29,7 +31,7 @@ logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT) null_handler = logging.NullHandler() if ENABLE_LOGGING and LOGGING_TO_STDERR: logger_all_handler_stderr = logging.StreamHandler() - logger_all_handler_stderr.setLevel(LoggingLevels.WARNING) + logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR) logger_all_handler_stderr.setFormatter(logger_all_formater) else: logger_all_handler_stderr = null_handler @@ -41,14 +43,13 @@ if ENABLE_LOGGING and LOGGING_TO_FILE: maxBytes=5_242_880, backupCount=1, ) - logger_all_handler_file.setLevel(LoggingLevels.DEBUG) + logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE) logger_all_handler_file.setFormatter(logger_all_formater) else: logger_all_handler_file = null_handler # ** logging levels -LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG @@ -56,12 +57,15 @@ LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG +LOGGING_LEVEL_CONFIG: Final[LoggingLevels] = LoggingLevels.DEBUG # ** loggers and configuration logger_all = logging.getLogger('lang_main') logger_all.addHandler(logger_all_handler_stderr) logger_all.addHandler(logger_all_handler_file) +logger_config = logging.getLogger('lang_main.config') +logger_config.setLevel(LOGGING_LEVEL_CONFIG) logger_shared_helpers = logging.getLogger('lang_main.shared') logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS) logger_preprocess = logging.getLogger('lang_main.analysis.preprocessing') diff --git a/tests/test_lang_main_init.py b/tests/test_lang_main_init.py index 7456053..474088f 100644 --- a/tests/test_lang_main_init.py +++ b/tests/test_lang_main_init.py @@ -1,5 +1,5 @@ -from lang_main import BASE_PATH +from lang_main import ROOT_PATH def test_base_path(): - assert BASE_PATH is not None + assert ROOT_PATH is not None