# Recovered from a patch that
#   * renames prototypes/02_integrate_wokflow.py to
#     prototypes/02-1_integrate_wokflow.py (similarity index 100%), and
#   * adds the new file prototypes/02-2_aggregates.py, whose content follows.

# %%
import datetime
import importlib
from pathlib import Path

import external_code
import polars as pl
import sqlalchemy as sql

from wattanalyse import db

# Re-execute the project modules in the running interpreter.
importlib.reload(db)
importlib.reload(external_code)

# %%
# The project root is one directory above the folder containing this script.
PROJECT_BASE = Path(__file__).parents[1]
DATA_PTH = PROJECT_BASE.joinpath("data")
assert DATA_PTH.exists()

# %%
# // load data
target = DATA_PTH.joinpath("PSM_20260507.arrow")
data_raw = pl.scan_ipc(target)

# %%
# Workflow overview (struck-through steps are kept as written in the original plan):
# 0. read data (from customer's database)
# 1. cleanup obtained new data
# ~~2. load data from internal database~~
# ~~3. integrate with new data (whole snapshot)~~
# 2. process on order level
# 3. save results to internal database
# 4. post-process results
# 5. write to external database

# // (1) cleanup obtained new data
# load data from internal database
# integrate with new data (whole snapshot)
res = external_code.preprocess_psm(data_raw)
data = res.data

print(f"Data:\n{data.collect()}\n\n---\n\nFiltered:\n{res.filtered}")

# %%
# // (2) processing order level
df = external_code.process_order_level(data)


# ?? What if "Konfektionär" is NULL?
# If it is NULL, the aggregates per "Konfektionär" will not work as intended. Instead, they
# are calculated over all NULL entries together, which may mix production orders that belong
# to different "Konfektionär". These values will therefore be calculated, but should not be
# considered.
# %%
# // (3) save results to internal database
external_code.dump_order_level_to_internal_database_wipe(df)

# %%
# now load data from database
df = external_code.load_order_level_from_internal_database()
df

# %%
tmp = df.clone()

# Two ways to define the aggregate for date deviations: either use the sign of the
# deviation (< 0 / > 0) or use the Boolean flag columns derived from the user-specified
# boundaries.
USE_BOUNDARIES = False
deviation_days = pl.col("Terminabweichung_Anzahl_Tage")
filter_date_deviation_early: pl.Expr = (
    pl.col("Terminunterschreitung") if USE_BOUNDARIES else (deviation_days < 0)
)
filter_date_deviation_late: pl.Expr = (
    pl.col("Terminüberschreitung") if USE_BOUNDARIES else (deviation_days > 0)
)


def _rounded_mean(values: pl.Expr, name: str, *, absolute: bool = True) -> pl.Expr:
    """Return the mean of *values* as an Int64 expression aliased to *name*.

    The mean is optionally made absolute, then rounded with the
    "half_away_from_zero" mode before being cast to ``pl.Int64``.
    """
    average = values.mean()
    if absolute:
        average = average.abs()
    return average.round(mode="half_away_from_zero").cast(pl.Int64).alias(name)


# Last expression of the cell: the resulting single-row aggregate frame is displayed.
tmp.select(
    # mean magnitude of the deviation over the rows selected by the "early" filter
    _rounded_mean(
        deviation_days.filter(filter_date_deviation_early),
        "Mittlere_Tage_Unterschreitung",
    ),
    # mean magnitude of the deviation over the rows selected by the "late" filter
    _rounded_mean(
        deviation_days.filter(filter_date_deviation_late),
        "Mittlere_Tage_Ueberschreitung",
    ),
    # sample standard deviation (ddof=1) over all deviations
    deviation_days.std(ddof=1).alias("Standardabweichung_Lieferterminabweichung"),
    _rounded_mean(
        pl.col("Import-Ist_Anzahl_Aenderungen"),
        "Mittlere_Anzahl_Anpassungen_Liefertermin",
    ),
    # flatten the list column first so the mean runs over every single list element
    _rounded_mean(
        pl.col("Tage_zu_letzter_PSM_Historie").list.explode(),
        "Mittlere_Abstaende_PSM",
    ),
    # no absolute value here: the sign of the mean is kept
    _rounded_mean(
        pl.col("Durchlaufzeit_Anzahl_Tage"),
        "Mittlere_Durchlaufzeit",
        absolute=False,
    ),
)

# %%