successful saving/loading of production order aggregate table

2026-06-05 17:35:01 +02:00
parent 9c8b4ea48c
commit 53df924bcb
3 changed files with 175 additions and 134 deletions
--- a/prototypes/external_code.py
+++ b/prototypes/external_code.py
@@ -20,7 +20,7 @@ from wattanalyse import db

@dc.dataclass(slots=True, eq=False)
 class PreProcessResult:
-    data: pl.DataFrame
+    data: pl.LazyFrame
    filtered: pl.DataFrame


@@ -58,7 +58,7 @@ NUMBER_YEARS_UPPER_BOUND_DATES: Final[int] = 4

 # // (1) preprocess
 def preprocess_psm(
-    data: pl.DataFrame,
+    data: pl.LazyFrame,
 ) -> PreProcessResult:
    data = data.rename(RENAMING_SCHEME)
    REGEX_PATTERN = r"^[\s\-#+/$]+$"
@@ -69,7 +69,23 @@ def preprocess_psm(
        .name.keep()
    )
    data = data.with_columns(pl.col("Konfektionär").str.strip_chars(" \n\t"))
-    filtered_data = pl.DataFrame(schema=data.schema)
+    filtered_data = pl.LazyFrame(schema=data.collect_schema())
+
+    # drop duplicates
+    # use null count as information measure, least amount of nulls should be contained
+    base_columns = data.collect_schema().names()  # avoids LazyFrame.columns PerformanceWarning
+    data = data.with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
+    data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie", "null_count"], descending=False)
+    filtered_data = pl.concat(
+        [
+            filtered_data,
+            data.filter(
+                ~pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct()
+            ).select(base_columns),
+        ]
+    )
+    data = data.filter(pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct())
+    data = data.drop("null_count")

    # any NULL values in critical columns
    NOT_NULL_COLS = ("PA", "PA_Pos", "Meldezeitpunkt_Historie")
@@ -115,11 +131,13 @@ def preprocess_psm(
    filtered_data = pl.concat([filtered_data, data.filter(pl.any_horizontal(cond))])
    data = data.filter(~pl.any_horizontal(cond))

-    return PreProcessResult(data=data, filtered=filtered_data)
+    return PreProcessResult(data=data, filtered=filtered_data.collect())


 # // (2) process on order level
-def process_order_level(data: pl.DataFrame) -> pl.DataFrame:
+def process_order_level(
+    data: pl.LazyFrame,
+) -> pl.LazyFrame:
    # ** renaming
    # data = data.rename(RENAMING_SCHEME) # TODO delete, done in pre-processing
    data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie"], descending=False)
@@ -255,42 +273,67 @@ def process_order_level(data: pl.DataFrame) -> pl.DataFrame:
    # ** order specific aggregates
    data = (
        data.with_columns(
-            pl.when(
-                (pl.col("Liefertermin_Ist").is_not_null())
-                & (pl.col("Liefertermin_Soll").is_not_null())
-            )
-            .then((pl.col("Liefertermin_Ist") - pl.col("Liefertermin_Soll")).dt.total_days())
-            .otherwise(None)
+            (pl.col("Liefertermin_Ist") - pl.col("Liefertermin_Soll"))
+            .dt.total_days()
            .alias("Terminabweichung_Anzahl_Tage")
        )
        .with_columns(
-            pl.when(pl.col("Terminabweichung_Anzahl_Tage") < LOWER_BOUND_DATE_DEVIATION)
-            .then(pl.lit(True))
-            .otherwise(pl.lit(False))
-            .alias("Terminunterschreitung"),
-            pl.when(pl.col("Terminabweichung_Anzahl_Tage") > UPPER_BOUND_DATE_DEVIATION)
-            .then(pl.lit(True))
-            .otherwise(pl.lit(False))
-            .alias("Terminüberschreitung"),
-            pl.when(
-                (pl.col("Liefertermin_Ist").is_not_null())
-                & (pl.col("Prod-Start").is_not_null())
-            )
-            .then((pl.col("Liefertermin_Ist") - pl.col("Prod-Start")).dt.total_days())
-            .otherwise(None)
+            (pl.col("Terminabweichung_Anzahl_Tage") < LOWER_BOUND_DATE_DEVIATION)
+            .fill_null(False)  # null deviation -> False, as the replaced when/otherwise did
+            .alias("Terminunterschreitung"),
+            (pl.col("Terminabweichung_Anzahl_Tage") > UPPER_BOUND_DATE_DEVIATION)
+            .fill_null(False)  # null deviation -> False, as the replaced when/otherwise did
+            .alias("Terminüberschreitung"),
+            (pl.col("Liefertermin_Ist") - pl.col("Prod-Start"))
+            .dt.total_days()
            .alias("Durchlaufzeit_Anzahl_Tage"),
        )
        .with_columns(
-            pl.when(
-                (pl.col("Durchlaufzeit_Anzahl_Tage").is_not_null())
-                & (pl.col("Durchlaufzeit_Anzahl_Tage") < 0)
-            )
+            pl.when(pl.col("Durchlaufzeit_Anzahl_Tage") < 0)
            .then(None)
            .otherwise(pl.col("Durchlaufzeit_Anzahl_Tage"))
            .alias("Durchlaufzeit_Anzahl_Tage")
        )
    )

+    # data = (
+    #     data.with_columns(
+    #         pl.when(
+    #             (pl.col("Liefertermin_Ist").is_not_null())
+    #             & (pl.col("Liefertermin_Soll").is_not_null())
+    #         )
+    #         .then((pl.col("Liefertermin_Ist") - pl.col("Liefertermin_Soll")).dt.total_days())
+    #         .otherwise(None)
+    #         .alias("Terminabweichung_Anzahl_Tage")
+    #     )
+    #     .with_columns(
+    #         pl.when(pl.col("Terminabweichung_Anzahl_Tage") < LOWER_BOUND_DATE_DEVIATION)
+    #         .then(pl.lit(True))
+    #         .otherwise(pl.lit(False))
+    #         .alias("Terminunterschreitung"),
+    #         pl.when(pl.col("Terminabweichung_Anzahl_Tage") > UPPER_BOUND_DATE_DEVIATION)
+    #         .then(pl.lit(True))
+    #         .otherwise(pl.lit(False))
+    #         .alias("Terminüberschreitung"),
+    #         pl.when(
+    #             (pl.col("Liefertermin_Ist").is_not_null())
+    #             & (pl.col("Prod-Start").is_not_null())
+    #         )
+    #         .then((pl.col("Liefertermin_Ist") - pl.col("Prod-Start")).dt.total_days())
+    #         .otherwise(None)
+    #         .alias("Durchlaufzeit_Anzahl_Tage"),
+    #     )
+    #     .with_columns(
+    #         pl.when(
+    #             (pl.col("Durchlaufzeit_Anzahl_Tage").is_not_null())
+    #             & (pl.col("Durchlaufzeit_Anzahl_Tage") < 0)
+    #         )
+    #         .then(None)
+    #         .otherwise(pl.col("Durchlaufzeit_Anzahl_Tage"))
+    #         .alias("Durchlaufzeit_Anzahl_Tage")
+    #     )
+    # )
+
    return data


@@ -313,7 +356,7 @@ def _parse_to_json(


 def dump_order_level_to_internal_database_staging(
-    data: pl.DataFrame,
+    data: pl.LazyFrame,
 ) -> None:

    staging_data = data.with_columns(
@@ -324,6 +367,7 @@ def dump_order_level_to_internal_database_staging(
        )
        .name.keep()
    )
+    staging_data = staging_data.collect()
    rows_inserted = staging_data.write_database(
        "Produktionsauftrag-Einzelsicht_Staging",
        connection=db.DB_URI,
@@ -355,7 +399,7 @@ def dump_order_level_to_internal_database_staging(


 def dump_order_level_to_internal_database_wipe(
-    data: pl.DataFrame,
+    data: pl.LazyFrame,
 ) -> None:

    staging_data = data.with_columns(
@@ -370,6 +414,7 @@ def dump_order_level_to_internal_database_wipe(
    with db.ENGINE_INTERNAL.begin() as conn:
        conn.execute(sql.text('DELETE FROM "Produktionsauftrag-Einzelsicht";'))

+    staging_data = staging_data.collect()
    rows_inserted = staging_data.write_database(
        "Produktionsauftrag-Einzelsicht",
        connection=db.DB_URI,