diff --git a/lang_main_config.toml b/lang_main_config.toml index 0935eee..c50b611 100644 --- a/lang_main_config.toml +++ b/lang_main_config.toml @@ -1,10 +1,8 @@ -# lang_main: Config file +# d-opt -- lang_main: config file [paths] -inputs = './data/' -# results = './results/dummy_N_1000/' -# dataset = '../data/Dummy_Dataset_N_1000.csv' -results = './data/' +inputs = '../lang-data/in/' +results = '../lang-data/out/' models = '../lang-models' [logging] @@ -28,15 +26,15 @@ date_cols = [ "Arbeitsbeginn", "ErstellungsDatum", ] +target_feature = "VorgangsBeschreibung" threshold_amount_characters = 5 -threshold_similarity = 0.8 +threshold_similarity = 0.92 [graph_postprocessing] -threshold_edge_number = 330 -# threshold_edge_weight = 150 +max_edge_number = -1 [time_analysis.uniqueness] -threshold_unique_texts = 4 +threshold_unique_texts = 5 criterion_feature = 'HObjektText' feature_name_obj_id = 'ObjektID' feature_name_obj_text = 'HObjektText' @@ -46,11 +44,6 @@ name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]' name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]' [time_analysis.model_input] -# input_features = [ -# 'VorgangsTypName', -# 'VorgangsArtText', -# 'VorgangsBeschreibung', -# ] input_features = [ 'VorgangsBeschreibung', ] @@ -59,5 +52,5 @@ activity_types = [ 'Reparaturauftrag (Portal)', 'Störungsmeldung', ] -threshold_num_acitivities = 1 +threshold_num_activities = 1 threshold_similarity = 0.8 \ No newline at end of file diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py index 9dab9ed..b0842d4 100644 --- a/src/lang_main/constants.py +++ b/src/lang_main/constants.py @@ -123,19 +123,29 @@ TAG_OF_INTEREST: frozenset[str] = frozenset() # ** export # ** preprocessing DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols'] -THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][ - 'threshold_amount_characters' -] -THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'] +TARGET_FEATURE: Final[str] = CONFIG['preprocess']['target_feature'] +threshold_amount_characters: int = CONFIG['preprocess']['threshold_amount_characters'] +if threshold_amount_characters < 0: + threshold_amount_characters = 0 +THRESHOLD_AMOUNT_CHARACTERS: Final[int] = threshold_amount_characters +threshold_similarity: float = CONFIG['preprocess']['threshold_similarity'] +if threshold_similarity < 0 or threshold_similarity > 1: + raise ValueError( + ( + '[CONFIG][preprocess][threshold_similarity] Preprocessing similarity ' + 'threshold must be between 0 and 1.' + ) + ) +THRESHOLD_SIMILARITY: Final[float] = threshold_similarity # ** token analysis # ** graph postprocessing -EDGE_WEIGHT_DECIMALS: Final[int] = 4 -threshold_edge_number: int | None = None -cfg_threshold_edge_number: int = CONFIG['graph_postprocessing']['threshold_edge_number'] -if cfg_threshold_edge_number >= 0: - threshold_edge_number = cfg_threshold_edge_number -THRESHOLD_EDGE_NUMBER: Final[int | None] = threshold_edge_number +EDGE_WEIGHT_DECIMALS: Final[int] = 6 +max_edge_number: int | None = None +max_edge_number_cfg: int = CONFIG['graph_postprocessing']['max_edge_number'] +if max_edge_number_cfg >= 0: + max_edge_number = max_edge_number_cfg +MAX_EDGE_NUMBER: Final[int | None] = max_edge_number PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted' PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality' PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance' @@ -163,9 +173,10 @@ CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2 CYTO_NETWORK_ZOOM_FACTOR: Final[float] = 0.96 # ** time_analysis.uniqueness -THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][ - 'threshold_unique_texts' -] +threshold_unique_texts: int = CONFIG['time_analysis']['uniqueness']['threshold_unique_texts'] +if threshold_unique_texts < 0: + threshold_unique_texts = 0 +THRESHOLD_UNIQUE_TEXTS: Final[int] = threshold_unique_texts UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][ 'criterion_feature' ] @@ -174,8 +185,6 @@ FEATURE_NAME_OBJ_TEXT: Final[str] = CONFIG['time_analysis']['uniqueness'][ 'feature_name_obj_text' ] # ** time_analysis.preparation -# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair' -CONFIG['time_analysis']['preparation']['name_delta_feat_to_repair'] NAME_DELTA_FEAT_TO_REPAIR: Final[str] = CONFIG['time_analysis']['preparation'][ 'name_delta_feat_to_repair' ] @@ -190,9 +199,21 @@ ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_ ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple( CONFIG['time_analysis']['model_input']['activity_types'] ) -THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][ - 'threshold_num_acitivities' +threshold_num_activities: int = CONFIG['time_analysis']['model_input'][ + 'threshold_num_activities' ] -THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][ +if threshold_num_activities < 0: + threshold_num_activities = 0 +THRESHOLD_NUM_ACTIVITIES: Final[int] = threshold_num_activities + +threshold_timeline_similarity: float = CONFIG['time_analysis']['model_input'][ 'threshold_similarity' ] +if threshold_timeline_similarity < 0 or threshold_timeline_similarity > 1: + raise ValueError( + ( + '[CONFIG][time_analysis.model_input][threshold_similarity] Timeline similarity ' + 'threshold must be between 0 and 1.' + ) + ) +THRESHOLD_TIMELINE_SIMILARITY: Final[float] = threshold_timeline_similarity diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml index ca63d52..278ecf7 100644 --- a/src/lang_main/lang_main_config.toml +++ b/src/lang_main/lang_main_config.toml @@ -1,9 +1,7 @@ -# lang_main: Config file +# d-opt -- lang_main: config file [paths] inputs = '../data/in/' -# results = './results/dummy_N_1000/' -# dataset = '../data/Dummy_Dataset_N_1000.csv' results = '../data/out/' models = './lang-models' @@ -28,15 +26,15 @@ date_cols = [ "Arbeitsbeginn", "ErstellungsDatum", ] +target_feature = "VorgangsBeschreibung" threshold_amount_characters = 5 -threshold_similarity = 0.9 +threshold_similarity = 0.92 [graph_postprocessing] -threshold_edge_number = 330 -# threshold_edge_weight = 150 +max_edge_number = -1 [time_analysis.uniqueness] -threshold_unique_texts = 4 +threshold_unique_texts = 5 criterion_feature = 'HObjektText' feature_name_obj_id = 'ObjektID' feature_name_obj_text = 'HObjektText' @@ -46,11 +44,6 @@ name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]' name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]' [time_analysis.model_input] -# input_features = [ -# 'VorgangsTypName', -# 'VorgangsArtText', -# 'VorgangsBeschreibung', -# ] input_features = [ 'VorgangsBeschreibung', ] @@ -59,5 +52,5 @@ activity_types = [ 'Reparaturauftrag (Portal)', 'Störungsmeldung', ] -threshold_num_acitivities = 1 +threshold_num_activities = 1 threshold_similarity = 0.8 \ No newline at end of file