diff --git a/src/wattanalyse/pipelines.py b/src/wattanalyse/pipelines.py index e4bcf0b..8b6c5d3 100644 --- a/src/wattanalyse/pipelines.py +++ b/src/wattanalyse/pipelines.py @@ -58,6 +58,18 @@ RENAMING_SCHEME_PSM: dict[str, str] = { PRIM_KEYS: Final[list[str]] = ["PA", "PA_Pos"] +NOT_NULL_COLS: Final[tuple[str, ...]] = ("PA", "PA_Pos", "Meldezeitpunkt_Historie") +NOT_IN_FUTURE_COLS_DATETIME: Final[tuple[str, ...]] = ("Meldezeitpunkt_Historie",) +NOT_IN_FUTURE_COLS_DATE: Final[tuple[str, ...]] = ("Wareneingang am", "Prod-Start_Historie") + +PLAUSI_FEATURES: Final[list[str]] = [ + "Prod-EP10_Historie", + "Prod-EP20_Historie", + "Prod-EP30_Historie", + "Prod-EP40_Historie", + "Prod-EP50_Historie", +] + LOWER_BOUND_DATE_DEVIATION: Final[int] = ( USER_CFG.Datenpipelines_PSM.Terminabweichung_untere_Schranke ) @@ -133,7 +145,6 @@ def preprocess_psm( data = data.drop("null_count") # any NULL values in critical columns - NOT_NULL_COLS = ("PA", "PA_Pos", "Meldezeitpunkt_Historie") conds = [pl.col(col).is_null() for col in NOT_NULL_COLS] filtered_data = pl.concat([filtered_data, data.filter(pl.any_horizontal(*conds))]) data = data.filter(~pl.any_horizontal(*conds)) @@ -142,8 +153,6 @@ def preprocess_psm( # dates not allowed to be in the future current_datetime = datetime.datetime.now() current_date = current_datetime.date() - NOT_IN_FUTURE_COLS_DATETIME = ("Meldezeitpunkt_Historie",) - NOT_IN_FUTURE_COLS_DATE = ("Wareneingang am", "Prod-Start_Historie") conds = [ (pl.col(col) > current_datetime).fill_null(False) for col in NOT_IN_FUTURE_COLS_DATETIME @@ -188,13 +197,6 @@ def process_order_level( data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie"], descending=False) # ** plausibility check of order quantities - PLAUSI_FEATURES: list[str] = [ - "Prod-EP10_Historie", - "Prod-EP20_Historie", - "Prod-EP30_Historie", - "Prod-EP40_Historie", - "Prod-EP50_Historie", - ] data = data.with_columns( pl.all_horizontal( pl.col(PLAUSI_FEATURES).is_null() | (pl.col(PLAUSI_FEATURES) == 0)