construct base pipeline with "run" function

This commit is contained in:
2026-06-10 16:48:03 +02:00
parent b66d5a4921
commit 5e15c99520
6 changed files with 844 additions and 50 deletions

View File

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, Final, TypeAlias, cast
import polars as pl
import sqlalchemy as sql
from dopt_basics.datastructures import flatten
from wattanalyse import db
@@ -30,7 +31,13 @@ SqlStatement: TypeAlias = str
@dc.dataclass(slots=True, eq=False)
class PreProcessResult:
data: pl.LazyFrame
filtered: pl.DataFrame
filtered: pl.LazyFrame
# Column names removed from the PSM data in `preprocess_psm`, where they are
# passed to `data.drop(DROP_COLUMNS, strict=False)`.
# Built from the lower-case, upper-case and capitalised spelling of each of
# "id", "index" and "idx"; `flatten` merges the per-name tuples into one list.
# NOTE(review): presumably yields "id", "ID", "Id", "index", ... as a flat list
# of strings -- this depends on `dopt_basics.datastructures.flatten`; confirm.
# `cast` only informs the type checker of the element type; it performs no
# runtime conversion.
DROP_COLUMNS: Final[list[str]] = cast(
list[str],
list(flatten(((x.lower(), x.upper(), x.capitalize()) for x in ("id", "index", "idx")))),
)
@dc.dataclass(slots=True, kw_only=True)
@@ -51,7 +58,7 @@ PSM_SCORES: dict[QualityPsm, int] = {
QualityPsm.PLAUSIBEL: 2,
}
RENAMING_SCHEME: dict[str, str] = {
RENAMING_SCHEME_PSM: dict[str, str] = {
"PA Pos": "PA_Pos",
"PSM gemeldet am": "Meldezeitpunkt_Historie",
"Import Ist": "Import-Ist_Historie",
@@ -62,6 +69,8 @@ RENAMING_SCHEME: dict[str, str] = {
"Fertigware aus Nähband": "Prod-EP30_Historie",
"Teile kontrolliert": "Prod-EP40_Historie",
"Teile verpackt in Karton": "Prod-EP50_Historie",
"Konfektionär": "Konfektionaer",
"Lieferantnr.": "Konfektionaer_ID",
}
PRIM_KEYS: Final[list[str]] = ["PA", "PA_Pos"]
@@ -91,7 +100,8 @@ def load_PSM_data(
def preprocess_psm(
data: pl.LazyFrame,
) -> PreProcessResult:
data = data.rename(RENAMING_SCHEME)
data = data.rename(RENAMING_SCHEME_PSM)
data = data.drop(DROP_COLUMNS, strict=False)
REGEX_PATTERN = r"^[\s\-#+/$]+$"
data = data.with_columns(
pl.when(pl.col(pl.String).str.contains(REGEX_PATTERN))
@@ -99,7 +109,7 @@ def preprocess_psm(
.otherwise(pl.col(pl.String))
.name.keep()
)
data = data.with_columns(pl.col("Konfektionär").str.strip_chars(" \n\t"))
data = data.with_columns(pl.col("Konfektionaer").str.strip_chars(" \n\t"))
filtered_data = pl.LazyFrame(schema=data.collect_schema())
# drop duplicates
@@ -161,7 +171,7 @@ def preprocess_psm(
filtered_data = pl.concat([filtered_data, data.filter(pl.any_horizontal(cond))])
data = data.filter(~pl.any_horizontal(cond))
return PreProcessResult(data=data, filtered=filtered_data.collect())
return PreProcessResult(data=data, filtered=filtered_data)
# // (2) process on order level
@@ -169,7 +179,6 @@ def process_order_level(
data: pl.LazyFrame,
) -> pl.LazyFrame:
# ** renaming
# data = data.rename(RENAMING_SCHEME) # TODO delete, done in pre-processing
data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie"], descending=False)
# ** plausibility check of order quantities
@@ -272,7 +281,7 @@ def process_order_level(
# whole aggregates see DB schema
data = (
data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie"], descending=False)
.group_by(PRIM_KEYS + ["Konfektionär"])
.group_by(PRIM_KEYS + ["Konfektionaer", "Konfektionaer_ID"])
.agg(
pl.col("Meldezeitpunkt_Historie"),
pl.col("Liefertermin_Soll").drop_nulls().first(),
@@ -508,7 +517,7 @@ def aggregate_production_orders(
def aggregate_suppliers(
data: pl.LazyFrame,
) -> pl.LazyFrame:
data = data.group_by("Konfektionär").agg(
data = data.group_by(["Konfektionaer", "Konfektionaer_ID"]).agg(
(
(
~(filter_date_deviation_early | filter_date_deviation_late)
@@ -573,8 +582,6 @@ def aggregate_suppliers(
# // (5) external database
def oracle_prepare_KPI_aggregate(
data: pl.LazyFrame,
rename_schema: dict[str, str] | None = None,
@@ -599,6 +606,7 @@ def oracle_prepare_KPI_aggregate(
pl.all().exclude(pl.Boolean),
)
.select(cols_sorted)
.select(pl.all().name.to_uppercase())
)
return data