update config usage

2025-01-22 16:53:30 +01:00 · 2025-01-22 16:53:30 +01:00 · 27445a679b
commit 27445a679b
parent 0e36e78906
3 changed files with 53 additions and 46 deletions
--- a/lang_main_config.toml
+++ b/lang_main_config.toml
@ -1,10 +1,8 @@
-# lang_main: Config file
+# d-opt -- lang_main: config file

 [paths]
-inputs = './data/'
-# results = './results/dummy_N_1000/'
-# dataset = '../data/Dummy_Dataset_N_1000.csv'
-results = './data/'
+inputs = '../lang-data/in/'
+results = '../lang-data/out/'
 models = '../lang-models'

 [logging]
@ -28,15 +26,15 @@ date_cols = [
    "Arbeitsbeginn",
    "ErstellungsDatum",
 ]
+target_feature = "VorgangsBeschreibung"
 threshold_amount_characters = 5
-threshold_similarity = 0.8
+threshold_similarity = 0.92

 [graph_postprocessing]
-threshold_edge_number = 330
-# threshold_edge_weight = 150
+max_edge_number = -1

 [time_analysis.uniqueness]
-threshold_unique_texts = 4
+threshold_unique_texts = 5
 criterion_feature = 'HObjektText'
 feature_name_obj_id = 'ObjektID'
 feature_name_obj_text = 'HObjektText'
@ -46,11 +44,6 @@ name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
 name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

 [time_analysis.model_input]
-# input_features = [
-#     'VorgangsTypName',
-#     'VorgangsArtText',
-#     'VorgangsBeschreibung',
-# ]
 input_features = [
    'VorgangsBeschreibung',
 ]
@ -59,5 +52,5 @@ activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
 ]
-threshold_num_acitivities = 1
+threshold_num_activities = 1
 threshold_similarity = 0.8
--- a/src/lang_main/constants.py
+++ b/src/lang_main/constants.py
@ -123,19 +123,29 @@ TAG_OF_INTEREST: frozenset[str] = frozenset()
 # ** export
 # ** preprocessing
 DATE_COLS: Final[list[str]] = CONFIG['preprocess']['date_cols']
-THRESHOLD_AMOUNT_CHARACTERS: Final[float] = CONFIG['preprocess'][
-    'threshold_amount_characters'
-]
-THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
+TARGET_FEATURE: Final[str] = CONFIG['preprocess']['target_feature']
+threshold_amount_characters: int = CONFIG['preprocess']['threshold_amount_characters']
+if threshold_amount_characters < 0:
+    threshold_amount_characters = 0
+THRESHOLD_AMOUNT_CHARACTERS: Final[int] = threshold_amount_characters
+threshold_similarity: float = CONFIG['preprocess']['threshold_similarity']
+if threshold_similarity < 0 or threshold_similarity > 1:
+    raise ValueError(
+        (
+            '[CONFIG][preprocess][threshold_similarity] Preprocessing similarity '
+            'threshold must be between 0 and 1.'
+        )
+    )
+THRESHOLD_SIMILARITY: Final[float] = threshold_similarity
 # ** token analysis

 # ** graph postprocessing
-EDGE_WEIGHT_DECIMALS: Final[int] = 4
-threshold_edge_number: int | None = None
-cfg_threshold_edge_number: int = CONFIG['graph_postprocessing']['threshold_edge_number']
-if cfg_threshold_edge_number >= 0:
-    threshold_edge_number = cfg_threshold_edge_number
-THRESHOLD_EDGE_NUMBER: Final[int | None] = threshold_edge_number
+EDGE_WEIGHT_DECIMALS: Final[int] = 6
+max_edge_number: int | None = None
+max_edge_number_cfg: int = CONFIG['graph_postprocessing']['max_edge_number']
+if max_edge_number_cfg >= 0:
+    max_edge_number = max_edge_number_cfg
+MAX_EDGE_NUMBER: Final[int | None] = max_edge_number
 PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
 PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality'
 PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance'
@ -163,9 +173,10 @@ CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
 CYTO_NETWORK_ZOOM_FACTOR: Final[float] = 0.96

 # ** time_analysis.uniqueness
-THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
-    'threshold_unique_texts'
-]
+threshold_unique_texts: int = CONFIG['time_analysis']['uniqueness']['threshold_unique_texts']
+if threshold_unique_texts < 0:
+    threshold_unique_texts = 0
+THRESHOLD_UNIQUE_TEXTS: Final[int] = threshold_unique_texts
 UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'criterion_feature'
 ]
@ -174,8 +185,6 @@ FEATURE_NAME_OBJ_TEXT: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'feature_name_obj_text'
 ]
 # ** time_analysis.preparation
-# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
-CONFIG['time_analysis']['preparation']['name_delta_feat_to_repair']
 NAME_DELTA_FEAT_TO_REPAIR: Final[str] = CONFIG['time_analysis']['preparation'][
    'name_delta_feat_to_repair'
 ]
@ -190,9 +199,21 @@ ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_
 ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
    CONFIG['time_analysis']['model_input']['activity_types']
 )
-THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
-    'threshold_num_acitivities'
+threshold_num_activities: int = CONFIG['time_analysis']['model_input'][
+    'threshold_num_activities'
 ]
-THRESHOLD_TIMELINE_SIMILARITY: Final[float] = CONFIG['time_analysis']['model_input'][
+if threshold_num_activities < 0:
+    threshold_num_activities = 0
+THRESHOLD_NUM_ACTIVITIES: Final[int] = threshold_num_activities
+
+threshold_timeline_similarity: float = CONFIG['time_analysis']['model_input'][
    'threshold_similarity'
 ]
+if threshold_timeline_similarity < 0 or threshold_timeline_similarity > 1:
+    raise ValueError(
+        (
+            '[CONFIG][time_analysis.model_input][threshold_similarity] Timeline similarity '
+            'threshold must be between 0 and 1.'
+        )
+    )
+THRESHOLD_TIMELINE_SIMILARITY: Final[float] = threshold_timeline_similarity
--- a/src/lang_main/lang_main_config.toml
+++ b/src/lang_main/lang_main_config.toml
@ -1,9 +1,7 @@
-# lang_main: Config file
+# d-opt -- lang_main: config file

 [paths]
 inputs = '../data/in/'
-# results = './results/dummy_N_1000/'
-# dataset = '../data/Dummy_Dataset_N_1000.csv'
 results = '../data/out/'
 models = './lang-models'

@ -28,15 +26,15 @@ date_cols = [
    "Arbeitsbeginn",
    "ErstellungsDatum",
 ]
+target_feature = "VorgangsBeschreibung"
 threshold_amount_characters = 5
-threshold_similarity = 0.9
+threshold_similarity = 0.92

 [graph_postprocessing]
-threshold_edge_number = 330
-# threshold_edge_weight = 150
+max_edge_number = -1

 [time_analysis.uniqueness]
-threshold_unique_texts = 4
+threshold_unique_texts = 5
 criterion_feature = 'HObjektText'
 feature_name_obj_id = 'ObjektID'
 feature_name_obj_text = 'HObjektText'
@ -46,11 +44,6 @@ name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
 name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

 [time_analysis.model_input]
-# input_features = [
-#     'VorgangsTypName',
-#     'VorgangsArtText',
-#     'VorgangsBeschreibung',
-# ]
 input_features = [
    'VorgangsBeschreibung',
 ]
@ -59,5 +52,5 @@ activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
 ]
-threshold_num_acitivities = 1
+threshold_num_activities = 1
 threshold_similarity = 0.8