From 8c6e36e43dcfeeb0aa2e025e86302c6f2ce8e860 Mon Sep 17 00:00:00 2001
From: foefl <f.foerster@d-opt.com>
Date: Wed, 3 Jun 2026 14:42:33 +0200
Subject: [PATCH] add prototyping

---
 .gitignore                           |   2 +-
 prototypes/01_first-look_20260603.py | 228 +++++++++++++++++++++++++++
 2 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 prototypes/01_first-look_20260603.py

diff --git a/.gitignore b/.gitignore
index 7ebd259..44bff78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 # own
-prototypes/
+# prototypes/
 data/
 reports/
 *.code-workspace
diff --git a/prototypes/01_first-look_20260603.py b/prototypes/01_first-look_20260603.py
new file mode 100644
index 0000000..4d53dc8
--- /dev/null
+++ b/prototypes/01_first-look_20260603.py
@@ -0,0 +1,228 @@
+# %%
+from pathlib import Path
+
+import polars as pl
+
+# %%
+PROJECT_BASE = Path(__file__).parents[1]  # directory that contains prototypes/
+DATA = PROJECT_BASE / "data"
+assert DATA.exists()  # fail early when the data directory is missing
+# %%
+data_t1 = DATA / "PSM/20260507"  # input folder used for this first look
+assert data_t1.exists()
+# %%
+data_t1_jobs = data_t1 / "MIS-Auträge_22.csv"  # NOTE(review): spelled "Auträge", not "Aufträge" - confirm it matches the file on disk
+assert data_t1_jobs.exists()
+data_t1_PSM = data_t1 / "Produktionsstandsmeldungen.csv"
+assert data_t1_PSM.exists()
+# %%
+# // MIS jobs (MIS-Aufträge)
+pl.read_csv(data_t1_jobs, encoding="windows-1252", separator=";")  # bare expression: shown as cell output only, result is not kept
+
+# %%
+# // PSM: explicit column dtypes for the PSM CSV, passed to read_csv as schema_overrides
+schema_PSM: dict[str, type[pl.DataType]] = {
+    "VK Auftrag": pl.UInt32,
+    "Artikelbez.": pl.String,
+    "Auftragsmenge": pl.UInt32,
+    "Kunde": pl.String,
+    "PA": pl.UInt64,
+    "PA Pos": pl.UInt32,
+    "PSM gemeldet am": pl.Datetime,
+    "Konfektionär": pl.String,
+    "Artikelnr.": pl.String,
+    "LT Kunde bestätigt": pl.Date,
+    "Export Ist": pl.Date,
+    "1.bestät. Import Konfektionär": pl.Date,
+    "Import Ist": pl.Date,
+    "Ablief.(Import Ist+Transport)": pl.Date,
+    "Wareneingang am": pl.Date,
+    "Wareneingang geprüft": pl.String,
+    "Täglicher Ausstoss": pl.Int64,  # NOTE(review): the only signed integer column here - confirm negative values are expected
+    "Zuschnitt am": pl.Date,
+    "Teile in Zuschnitt": pl.UInt64,
+    "Teile im Nähband": pl.UInt64,
+    "Fertigware aus Nähband": pl.UInt64,
+    "Teile kontrolliert": pl.UInt64,
+    "Teile verpackt in Karton": pl.UInt64,
+    "Anzahl Bänder": pl.UInt16,
+    "Anzahl Näher": pl.UInt16,
+    "Arbeitsstunden pro Näher": pl.UInt8,
+    "Anzahl Arbeitstage pro Woche": pl.UInt8,
+    "Blockauftrag": pl.String,
+}
+
+# Read with the explicit dtypes above instead of relying on type inference.
+psm = pl.read_csv(
+    data_t1_PSM,
+    encoding="windows-1252",
+    separator=";",
+    schema_overrides=schema_PSM,
+    null_values=["01.01.1111 00:00:00"],  # this exact value is read as null (presumably the export's "no date" placeholder)
+)
+# %%
+psm.filter(pl.col("Konfektionär").str.contains("MEMTEKS"))  # rows whose Konfektionär contains "MEMTEKS"
+# %%
+
+# %%
+psm.estimated_size("mb")  # size before the placeholder cleanup below
+
+# %%
+regex_pattern = r"^[\s\-#+/$]+$"  # whole value consists only of whitespace, -, #, +, / or $
+psm = psm.with_columns(
+    pl.when(pl.col(pl.String).str.contains(regex_pattern))  # applied to every String column
+    .then(None)  # such values become null
+    .otherwise(pl.col(pl.String))
+    .name.keep()  # keep each column's original name
+)
+psm.filter((pl.col.PA == 17191) & (pl.col("PA Pos") == 10))  # spot check of one (PA, PA Pos)
+
+# %%
+psm.estimated_size("mb")  # size after the cleanup, for comparison
+
+# %%
+psm.head()
+# %%
+psm.filter(pl.any_horizontal(pl.col("VK Auftrag").is_null()))  # any_horizontal over a single column equals the plain is_null() mask
+
+# %%
+psm.filter(pl.col("Wareneingang am").is_null()).group_by(  # "01.01.1111 00:00:00" is already null after read_csv (null_values); a Date column cannot be compared to that string
+    pl.col.Konfektionär
+).agg(pl.len())  # rows without a "Wareneingang am" date per Konfektionär (also counts cells that were empty in the CSV)
+
+# %%
+dupl_filter = psm.select([pl.col.PA, pl.col("PA Pos")]).is_duplicated()  # mask of rows whose (PA, PA Pos) pair repeats; not referenced again in this file
+# %%
+psm.group_by(["PA", "PA Pos"]).agg(pl.col("PA").n_unique().alias("unique")).sort(
+    "unique", descending=True
+)  # NOTE(review): PA is a grouping key here, so "unique" looks like it is always 1 - pl.len() may have been intended
+# %%
+most_occurrences = (
+    psm.group_by(["PA", "PA Pos", "Konfektionär"])
+    .agg(pl.len().alias("count"))  # number of rows per (PA, PA Pos, Konfektionär)
+    .sort("count", descending=True)  # largest groups first
+)
+most_occurrences
+# %%
+most_occurrences.filter(~pl.col("Konfektionär").str.contains("May Tekstil Camcesme"))  # same ranking without groups matching this name
+# %%
+psm.filter((pl.col.PA == 16003) & (pl.col("PA Pos") == 10)).sort(  # all rows of one (PA, PA Pos)
+    "PSM gemeldet am", descending=False  # ascending by this timestamp column
+)
+
+
+# %%
+psm.filter((pl.col.PA == 17085) & (pl.col("PA Pos") == 10)).sort(
+    "PSM gemeldet am", descending=False
+)
+# %%
+tmp = psm.filter((pl.col.PA == 15372) & (pl.col("PA Pos") == 10)).sort(  # working frame for the cells below
+    "PSM gemeldet am", descending=False
+)
+tmp
+# %%
+# // simulate time series
+# Snapshot k (1-based) holds the first k rows of tmp, which is sorted by "PSM gemeldet am".
+series: list[pl.DataFrame] = [tmp.head(n_rows) for n_rows in range(1, tmp.height + 1)]
+
+# sanity checks: one snapshot per row of tmp, and the k-th snapshot has exactly k rows
+snapshot_sizes = [snapshot.height for snapshot in series]
+
+assert len(series) == tmp.height
+
+assert snapshot_sizes == list(range(1, len(series) + 1))
+# %%
+series[1]  # second snapshot (first two rows); requires tmp to have at least two rows
+# %%
+tmp.columns
+# %%
+tmp = psm.filter((pl.col.PA == 16003) & (pl.col("PA Pos") == 10)).sort(  # rebinds tmp; series still holds snapshots of the previous tmp
+    "PSM gemeldet am", descending=False
+)
+# %%
+# // plausibility check
+# ** production quantities: column order matters, neighbouring columns are compared pairwise further down
+plausi_features_all = [
+    "Teile in Zuschnitt",
+    "Teile im Nähband",
+    "Fertigware aus Nähband",
+    "Teile kontrolliert",
+    "Teile verpackt in Karton",
+]
+plausi_features_endpoint_only = [  # same list without "Teile im Nähband"
+    "Teile in Zuschnitt",
+    "Fertigware aus Nähband",
+    "Teile kontrolliert",
+    "Teile verpackt in Karton",
+]
+plausi_features = plausi_features_all  # active list used to build the pairwise conditions
+# plausi_features = plausi_features_endpoint_only
+# %%
+IDX = None  # None = use every row of tmp; otherwise passed to tmp[...] to pick rows
+if IDX is None:
+    tmp_1 = tmp.select(plausi_features_all)  # NOTE(review): always selects plausi_features_all, not the active plausi_features - confirm intended
+else:
+    tmp_1 = tmp[IDX].select(plausi_features_all)
+print(tmp_1)
+# %%
+# ** empty: default state (every selected column is null or 0)
+tmp_1 = tmp_1.with_columns(
+    pl.all_horizontal(pl.col("*").is_null() | (pl.col("*") == 0)).alias("is_empty")
+)
+# %%
+# tmp_1 = tmp_1.transpose()
+# %%
+# tmp_1.shift(1)
+
+# %%
+conditions = [
+    pl.col(earlier) >= pl.col(later)
+    for earlier, later in zip(plausi_features, plausi_features[1:])
+]
+
+# Flag (not filter) rows: valid if EVERY neighbouring pair passes or the row is empty.
+row_is_plausible = pl.all_horizontal(conditions) | pl.col("is_empty")
+df_markiert = tmp_1.with_columns(
+    pl.when(row_is_plausible)
+    .then(pl.lit(True))
+    .otherwise(pl.lit(False))
+    .alias("Produktionsstückzahlen_valide")
+)
+print(df_markiert)
+
+# df_valide = tmp_1.filter(pl.all_horizontal(conditions))
+# df_invalide = tmp_1.filter(
+#     ~pl.all_horizontal(conditions)
+# )  # the tilde ~ means "NOT"
+
+# print("--- Valide Zeilen ---")
+# print(df_valide)
+
+# print("\n--- Invalide Zeilen ---")
+# print(df_invalide)
+
+
+# %%
+# 1. Create test data (rows 0-2 are valid, row 3 is the invalid example)
+df = pl.DataFrame(
+    {"EP-1": [0, 100, 100, 0], "EP-2": [0, 0, 100, 100], "EP-3": [0, 0, 0, 0]}
+)
+# 2. Recording points in their correct (consecutive) order
+ep_spalten = ["EP-1", "EP-2", "EP-3"]
+
+# 3. Build the conditions for all neighbouring pairs dynamically:
+#    the earlier point must be >= the point that follows it.
+bedingungen = [
+    pl.col(earlier) >= pl.col(later)
+    for earlier, later in zip(ep_spalten, ep_spalten[1:])
+]
+# 4. Apply the filter: pl.all_horizontal is true only if EVERY pair check holds in the row
+row_ok = pl.all_horizontal(bedingungen)
+df_valide = df.filter(row_ok)
+df_invalide = df.filter(~row_ok)  # the tilde ~ means "NOT"
+
+print("--- Valide Zeilen ---")
+print(df_valide)
+print("\n--- Invalide Zeilen ---")
+print(df_invalide)
+# %%