add prototyping

2026-06-03 14:42:33 +02:00
parent 9743d41dfd
commit 8c6e36e43d
2 changed files with 229 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 # own
-prototypes/
+# prototypes/
 data/
 reports/
 *.code-workspace
--- a/prototypes/01_first-look_20260603.py
+++ b/prototypes/01_first-look_20260603.py
@@ -0,0 +1,228 @@
 # %%
 from pathlib import Path
 import polars as pl
 # %%
 # Project root: parents[0] is the folder containing this file, parents[1]
 # is the folder above it.
 PROJECT_BASE = Path(__file__).parents[1]
 DATA = PROJECT_BASE / "data"
 # Sanity check that the data folder is present. Note that assert statements
 # are skipped when Python runs with -O.
 assert DATA.exists()
 # %%
 # Folder of the data delivery that is inspected in this script.
 data_t1 = DATA / "PSM/20260507"
 assert data_t1.exists()
 # %%
 # The two CSV files of that delivery.
 # NOTE(review): the file name below reads "Auträge" while the comment further
 # down says "Aufträge" - confirm it matches the file on disk.
 data_t1_jobs = data_t1 / "MIS-Auträge_22.csv"
 assert data_t1_jobs.exists()
 data_t1_PSM = data_t1 / "Produktionsstandsmeldungen.csv"
 assert data_t1_PSM.exists()
 # %%
 # // MIS-Aufträge
 # Read the jobs file (windows-1252 encoded, ";" separated). The result is not
 # assigned; presumably it is only inspected as interactive cell output.
 pl.read_csv(data_t1_jobs, encoding="windows-1252", separator=";")
 # %%
 # // PSM
 # Column name -> polars dtype, passed as schema_overrides so that these
 # columns get the listed dtype instead of an inferred one.
 schema_PSM: dict[str, type[pl.DataType]] = {
    "VK Auftrag": pl.UInt32,
    "Artikelbez.": pl.String,
    "Auftragsmenge": pl.UInt32,
    "Kunde": pl.String,
    "PA": pl.UInt64,
    "PA Pos": pl.UInt32,
    "PSM gemeldet am": pl.Datetime,
    "Konfektionär": pl.String,
    "Artikelnr.": pl.String,
    "LT Kunde bestätigt": pl.Date,
    "Export Ist": pl.Date,
    "1.bestät. Import Konfektionär": pl.Date,
    "Import Ist": pl.Date,
    "Ablief.(Import Ist+Transport)": pl.Date,
    "Wareneingang am": pl.Date,
    "Wareneingang geprüft": pl.String,
    "Täglicher Ausstoss": pl.Int64,
    "Zuschnitt am": pl.Date,
    "Teile in Zuschnitt": pl.UInt64,
    "Teile im Nähband": pl.UInt64,
    "Fertigware aus Nähband": pl.UInt64,
    "Teile kontrolliert": pl.UInt64,
    "Teile verpackt in Karton": pl.UInt64,
    "Anzahl Bänder": pl.UInt16,
    "Anzahl Näher": pl.UInt16,
    "Arbeitsstunden pro Näher": pl.UInt8,
    "Anzahl Arbeitstage pro Woche": pl.UInt8,
    "Blockauftrag": pl.String,
 }
 # Earlier variant without explicit dtypes, kept for reference:
 # psm = pl.read_csv(data_t1_PSM, encoding="windows-1252", separator=";")
 # Read the PSM file with the dtypes above. Cells that contain exactly
 # "01.01.1111 00:00:00" are read as null (presumably a placeholder date
 # written by the exporting system - confirm).
 psm = pl.read_csv(
    data_t1_PSM,
    encoding="windows-1252",
    separator=";",
    schema_overrides=schema_PSM,
    null_values=["01.01.1111 00:00:00"],
 )
 # %%
 # Rows whose "Konfektionär" value matches "MEMTEKS".
 memteks_mask = pl.col("Konfektionär").str.contains("MEMTEKS")
 psm.filter(memteks_mask)
 # %%
 # %%
 # Estimated size of the frame before cleaning.
 psm.estimated_size("mb")
 # %%
 # String cells that consist only of whitespace and the characters - # + / $
 # are replaced by null; every other value is kept unchanged.
 regex_pattern = r"^[\s\-#+/$]+$"
 string_cols = pl.col(pl.String)
 is_placeholder = string_cols.str.contains(regex_pattern)
 psm = psm.with_columns(
    pl.when(is_placeholder).then(None).otherwise(string_cols).name.keep()
 )
 # Spot check of a single order position after cleaning.
 psm.filter(pl.col("PA") == 17191, pl.col("PA Pos") == 10)
 # %%
 # Estimated size of the frame after cleaning.
 psm.estimated_size("mb")
 # %%
 psm.head()
 # %%
 # Rows without a value in "VK Auftrag".
 psm.filter(pl.col("VK Auftrag").is_null())
 # %%
 # Number of rows without a "Wareneingang am" date, per "Konfektionär".
 # The placeholder "01.01.1111 00:00:00" is already converted to null by
 # `null_values` when the CSV is read, and "Wareneingang am" is a Date column,
 # so comparing it with the placeholder string can never match. Test for
 # null instead. Cells that were empty in the file are counted as well.
 psm.filter(pl.col("Wareneingang am").is_null()).group_by(
    pl.col.Konfektionär
 ).agg(pl.len())
 # %%
 # Boolean mask: True for rows whose ("PA", "PA Pos") pair occurs more than once.
 dupl_filter = psm.select([pl.col.PA, pl.col("PA Pos")]).is_duplicated()
 # %%
 # Number of rows per ("PA", "PA Pos") key, most frequent keys first.
 # Counting the distinct values of "PA" inside a group that is keyed by "PA"
 # yields 1 for every group, so the row count is used to make repeated keys
 # visible.
 psm.group_by(["PA", "PA Pos"]).agg(pl.len().alias("count")).sort(
    "count", descending=True
 )
 # %%
 # Row count per ("PA", "PA Pos", "Konfektionär"), largest groups first.
 most_occurrences = (
    psm.group_by("PA", "PA Pos", "Konfektionär")
    .agg(count=pl.len())
    .sort("count", descending=True)
 )
 most_occurrences
 # %%
 # Same table without the rows whose "Konfektionär" matches the given name.
 not_may_tekstil = pl.col("Konfektionär").str.contains("May Tekstil Camcesme").not_()
 most_occurrences.filter(not_may_tekstil)
 # %%
 # Reports of single order positions, ordered ascending by "PSM gemeldet am".
 psm.filter(pl.col("PA") == 16003, pl.col("PA Pos") == 10).sort("PSM gemeldet am")
 # %%
 psm.filter(pl.col("PA") == 17085, pl.col("PA Pos") == 10).sort("PSM gemeldet am")
 # %%
 tmp = psm.filter(pl.col("PA") == 15372, pl.col("PA Pos") == 10).sort("PSM gemeldet am")
 tmp
 # %%
 # // simulate time series
 # series[k] holds the first k + 1 rows of tmp, i.e. one growing snapshot
 # per row of tmp.
 series: list[pl.DataFrame] = [
    tmp.head(n_rows) for n_rows in range(1, tmp.height + 1)
 ]
 assert len(series) == tmp.height
 # Every snapshot must be exactly one row longer than the one before it.
 assert all(
    position == snapshot.height
    for position, snapshot in enumerate(series, start=1)
 )
 # %%
 series[1]
 # %%
 tmp.columns
 # %%
 # Reports of one order position, ordered ascending by "PSM gemeldet am".
 tmp = psm.filter((pl.col.PA == 16003) & (pl.col("PA Pos") == 10)).sort(
    "PSM gemeldet am", descending=False
 )
 # %%
 # // plausibility check
 # ** production quantities
 # Quantity columns that take part in the check. The order of the list is
 # the order in which neighbouring columns are compared later on.
 plausi_features_all = [
    "Teile in Zuschnitt",
    "Teile im Nähband",
    "Fertigware aus Nähband",
    "Teile kontrolliert",
    "Teile verpackt in Karton",
 ]
 # Same list without "Teile im Nähband".
 plausi_features_endpoint_only = [
    "Teile in Zuschnitt",
    "Fertigware aus Nähband",
    "Teile kontrolliert",
    "Teile verpackt in Karton",
 ]
 # Switch between the two feature sets here.
 plausi_features = plausi_features_all
 # plausi_features = plausi_features_endpoint_only
 # %%
 # IDX = None keeps every row; any other value is used to index into tmp.
 # The active feature set is selected so that the switch above takes effect.
 IDX = None
 if IDX is None:
    tmp_1 = tmp.select(plausi_features)
 else:
    tmp_1 = tmp[IDX].select(plausi_features)
 print(tmp_1)
 # %%
 # ** empty: default state
 # A row is "empty" when every selected feature is null or 0. The features
 # are addressed by name, so running this cell a second time does not pull
 # the "is_empty" column itself into the check.
 tmp_1 = tmp_1.with_columns(
    pl.all_horizontal(
        pl.col(plausi_features).is_null() | (pl.col(plausi_features) == 0)
    ).alias("is_empty")
 )
 # %%
 # tmp_1 = tmp_1.transpose()
 # %%
 # tmp_1.shift(1)
 # %%
 # One comparison per pair of neighbouring features: the earlier column must
 # be greater than or equal to the later one.
 conditions = [
    pl.col(earlier) >= pl.col(later)
    for earlier, later in zip(plausi_features, plausi_features[1:])
 ]
 # 4. Apply the check
 # pl.all_horizontal makes sure the condition holds for EVERY pair in the row
 chain_ok = pl.all_horizontal(conditions)
 # A row is flagged True when the whole chain holds or the row is empty.
 # A null result is turned into False.
 # NOTE(review): rows with a null in only some features presumably end up as
 # False because comparisons against null are null - confirm this is intended.
 df_markiert = tmp_1.with_columns(
    (chain_ok | pl.col("is_empty"))
    .fill_null(False)
    .alias("Produktionsstückzahlen_valide")
 )
 print(df_markiert)
 # df_valide = tmp_1.filter(pl.all_horizontal(conditions))
 # df_invalide = tmp_1.filter(
 #     ~pl.all_horizontal(conditions)
 # )  # the tilde ~ means "NOT"
 # print("--- Valide Zeilen ---")
 # print(df_valide)
 # print("\n--- Invalide Zeilen ---")
 # print(df_invalide)
 # %%
 # 1. Build test data (rows 0-2 are valid, row 3 is the invalid example)
 df = pl.DataFrame(
    {
        "EP-1": [0, 100, 100, 0],
        "EP-2": [0, 0, 100, 100],
        "EP-3": [0, 0, 0, 0],
    }
 )
 # 2. Recording points in their correct (consecutive) order
 ep_spalten = ["EP-1", "EP-2", "EP-3"]
 # 3. Build the conditions for all neighbouring pairs dynamically
 # For every pair: is the earlier point >= the next point?
 bedingungen = [
    pl.col(earlier) >= pl.col(later)
    for earlier, later in zip(ep_spalten, ep_spalten[1:])
 ]
 # 4. Apply the filter
 # pl.all_horizontal makes sure the condition holds for EVERY pair in the row
 all_pairs_ok = pl.all_horizontal(bedingungen)
 df_valide = df.filter(all_pairs_ok)
 # The tilde ~ negates the combined condition.
 df_invalide = df.filter(~all_pairs_ok)
 print("--- Valide Zeilen ---")
 print(df_valide)
 print("\n--- Invalide Zeilen ---")
 print(df_invalide)
 # %%