prepare enhanced pipeline

This commit is contained in:
2026-06-10 13:32:39 +02:00
parent 59148aaaf3
commit b66d5a4921
2 changed files with 133 additions and 42 deletions

View File

@@ -0,0 +1,99 @@
# %%
import importlib
from pathlib import Path
from pprint import pprint
import external_code
import oracledb
import polars as pl
import wattanalyse
from wattanalyse import constants, db
# Re-execute the project modules so that the interactive session works with
# their current state instead of the versions cached at first import.
importlib.reload(wattanalyse)
importlib.reload(constants)
importlib.reload(external_code)
importlib.reload(db)
# Project root: the directory one level above the folder containing this file.
PROJECT_BASE = Path(__file__).parents[1]
DATA_PTH = PROJECT_BASE / "data"
# Abort right away if the expected data directory does not exist.
assert DATA_PTH.exists()
# %%
# Open the Oracle connection; credentials and endpoint are read from the
# `Datenbank` section of the user configuration (constants.USER_CFG).
# NOTE(review): the connection is not closed anywhere in the visible part of
# this script.
conn = oracledb.connect(
user=constants.USER_CFG.Datenbank.NUTZER,
password=constants.USER_CFG.Datenbank.PASSWORT,
host=constants.USER_CFG.Datenbank.HOST,
port=constants.USER_CFG.Datenbank.PORT,
service_name=constants.USER_CFG.Datenbank.SERVICE_NAME,
)
#####################################
# // Get data from database
# %%
# Load the complete "EXTERN_MIS" table with the MIS schema and materialise it.
schema = db.extern_MIS_t_schema
data_mis = external_code.oracle_load_table_as_polars(
conn,
schema=schema,
table_name="EXTERN_MIS",
).collect()
# Bare expression: shows the frame when the cell is run interactively.
data_mis
# %%
# Load the complete "EXTERN_PSM" table with the production-order schema.
# `schema` is re-bound here, so the MIS schema from the cell above is no
# longer available under that name.
schema = db.extern_prod_order_t_schema
data_psm = external_code.oracle_load_table_as_polars(
conn,
schema=schema,
table_name="EXTERN_PSM",
).collect()
data_psm
# %%
# Remove the "ID" column from both frames; strict=False means a missing
# column is ignored instead of raising.
data_mis = data_mis.drop("ID", strict=False)
data_psm = data_psm.drop("ID", strict=False)
# %%
# Row count of the unfiltered PSM data (displayed only).
data_psm.height
# %%
# In-memory variant: semi join keeps the PSM rows whose ("PA", "PA Pos") key
# also occurs in the MIS data. The result is displayed, not stored.
data_psm.join(data_mis, on=["PA", "PA Pos"], how="semi")
# %%
# Database-side variant of the same filter: select the PSM rows for which a
# MIS row with the same "PA" / "PA Pos" exists.
tab_name_psm = "EXTERN_PSM"
tab_name_mis = "EXTERN_MIS"
stmt = f"""
SELECT t1.* FROM "{tab_name_psm}" t1
WHERE EXISTS(
SELECT 1 FROM "{tab_name_mis}" t2
WHERE t1."PA" = t2."PA" AND t1."PA Pos" = t2."PA Pos"
)
"""
# test = external_code.oracle_load_table_as_polars(
# conn, db.extern_prod_order_t_schema, "", None
# ).collect()
# NOTE(review): both a table name and a statement are passed here; presumably
# the statement takes precedence - confirm against oracle_load_table_as_polars.
test = external_code.oracle_load_table_as_polars(
conn, db.extern_prod_order_t_schema, tab_name_psm, stmt
).collect()
# %%
# data_psm = external_code.load_PSM_data(conn).collect()
# %%
# // preprocess data
# TODO: add check with MIS data if the orders are relevant
# Work on a clone so that `data_psm` stays available for the later cells.
tmp = data_psm.clone()
res = external_code.preprocess_psm(tmp.lazy())
# `res.data` is collected below, i.e. it is a lazy frame.
tmp = res.data
tmp = tmp.collect()
tmp
# %%
# Rename the key column back to "PA Pos" so it matches the join key used
# with `data_mis` below.
tmp = tmp.rename({"PA_Pos": "PA Pos"})
# %%
# Preprocessed PSM rows that have a matching MIS row (displayed only).
tmp.join(data_mis, on=["PA", "PA Pos"], how="semi")
# %%
# Inspect the `filtered` attribute of the preprocessing result.
res.filtered
# %%
# Aggregation path: start again from the unprocessed PSM data, aggregate the
# production orders, then prepare the KPI aggregate; print both stages.
tmp = data_psm.clone()
tmp = external_code.aggregate_production_orders(tmp.lazy()).collect()
print(tmp)
tmp = external_code.oracle_prepare_KPI_aggregate(tmp.lazy()).collect()
print(tmp)

View File

@@ -4,7 +4,8 @@ import dataclasses as dc
import datetime import datetime
import enum import enum
import json import json
from typing import TYPE_CHECKING, Any, Final, cast import warnings
from typing import TYPE_CHECKING, Any, Final, TypeAlias, cast
import polars as pl import polars as pl
import sqlalchemy as sql import sqlalchemy as sql
@@ -13,6 +14,7 @@ from wattanalyse import db
if TYPE_CHECKING: if TYPE_CHECKING:
from oracledb import Connection as OracleConnection from oracledb import Connection as OracleConnection
from polars._typing import SchemaDict
# 1. cleanup obtained new data # 1. cleanup obtained new data
# ~~2. load data from internal database~~ # ~~2. load data from internal database~~
@@ -22,6 +24,8 @@ if TYPE_CHECKING:
# 4. post-process results # 4. post-process results
# 5. write to external database # 5. write to external database
SqlStatement: TypeAlias = str
@dc.dataclass(slots=True, eq=False) @dc.dataclass(slots=True, eq=False)
class PreProcessResult: class PreProcessResult:
@@ -65,6 +69,22 @@ PRIM_KEYS: Final[list[str]] = ["PA", "PA_Pos"]
LOWER_BOUND_DATE_DEVIATION: Final[int] = 0 LOWER_BOUND_DATE_DEVIATION: Final[int] = 0
UPPER_BOUND_DATE_DEVIATION: Final[int] = 0 UPPER_BOUND_DATE_DEVIATION: Final[int] = 0
NUMBER_YEARS_UPPER_BOUND_DATES: Final[int] = 4 NUMBER_YEARS_UPPER_BOUND_DATES: Final[int] = 4
TAB_NAME_PSM: Final[str] = "EXTERN_PSM"
TAB_NAME_MIS: Final[str] = "EXTERN_MIS"
# // (0) load data
def load_PSM_data(
    conn: OracleConnection,
) -> pl.LazyFrame:
    """Load the PSM rows that have a counterpart in the MIS table.

    The filter runs inside the database: a row of ``TAB_NAME_PSM`` is
    selected only if a row with the same "PA" and "PA Pos" exists in
    ``TAB_NAME_MIS``.

    Parameters
    ----------
    conn
        Open Oracle connection used to execute the query.

    Returns
    -------
    pl.LazyFrame
        Query result, loaded with the production-order schema
        (``db.extern_prod_order_t_schema``).
    """
    # The statement text is kept flush-left so the SQL sent to the database
    # stays exactly as written.
    query = f"""
SELECT t1.* FROM "{TAB_NAME_PSM}" t1
WHERE EXISTS(
SELECT 1 FROM "{TAB_NAME_MIS}" t2
WHERE t1."PA" = t2."PA" AND t1."PA Pos" = t2."PA Pos"
)
"""
    return oracle_load_table_as_polars(
        conn,
        db.extern_prod_order_t_schema,
        table_name=None,
        stmt=query,
    )
# // (1) preprocess # // (1) preprocess
@@ -306,44 +326,6 @@ def process_order_level(
) )
) )
# data = (
# data.with_columns(
# pl.when(
# (pl.col("Liefertermin_Ist").is_not_null())
# & (pl.col("Liefertermin_Soll").is_not_null())
# )
# .then((pl.col("Liefertermin_Ist") - pl.col("Liefertermin_Soll")).dt.total_days())
# .otherwise(None)
# .alias("Terminabweichung_Anzahl_Tage")
# )
# .with_columns(
# pl.when(pl.col("Terminabweichung_Anzahl_Tage") < LOWER_BOUND_DATE_DEVIATION)
# .then(pl.lit(True))
# .otherwise(pl.lit(False))
# .alias("Terminunterschreitung"),
# pl.when(pl.col("Terminabweichung_Anzahl_Tage") > UPPER_BOUND_DATE_DEVIATION)
# .then(pl.lit(True))
# .otherwise(pl.lit(False))
# .alias("Terminüberschreitung"),
# pl.when(
# (pl.col("Liefertermin_Ist").is_not_null())
# & (pl.col("Prod-Start").is_not_null())
# )
# .then((pl.col("Liefertermin_Ist") - pl.col("Prod-Start")).dt.total_days())
# .otherwise(None)
# .alias("Durchlaufzeit_Anzahl_Tage"),
# )
# .with_columns(
# pl.when(
# (pl.col("Durchlaufzeit_Anzahl_Tage").is_not_null())
# & (pl.col("Durchlaufzeit_Anzahl_Tage") < 0)
# )
# .then(None)
# .otherwise(pl.col("Durchlaufzeit_Anzahl_Tage"))
# .alias("Durchlaufzeit_Anzahl_Tage")
# )
# )
return data return data
@@ -637,11 +619,21 @@ def oracle_generate_sql_insert(
def oracle_load_table_as_polars(
    conn: OracleConnection,
    schema: SchemaDict | None,
    table_name: str | None = None,
    stmt: SqlStatement | None = None,
) -> pl.LazyFrame:
    """Run a query against Oracle and return the result as a Polars LazyFrame.

    Either a table name (the whole table is selected) or a complete SQL
    statement must be given. If both are given, the statement is used and a
    warning is emitted.

    Parameters
    ----------
    conn
        Open Oracle connection; its ``fetch_df_all`` method executes the query.
    schema
        Passed to ``pl.from_arrow`` as ``schema_overrides``; ``None`` applies
        no overrides.
    table_name
        Table to read completely via ``SELECT * FROM <table_name>``. The name
        is interpolated into the SQL text as-is (unquoted, unescaped), so it
        must come from trusted code, never from user input.
    stmt
        Complete SQL statement to execute instead of the generated one.

    Returns
    -------
    pl.LazyFrame
        The fetched data wrapped as a lazy frame.

    Raises
    ------
    ValueError
        If neither ``table_name`` nor ``stmt`` is provided (``None`` and
        empty strings both count as "not provided").
    """
    if not any((table_name, stmt)):
        raise ValueError("Table name or SQL statement must be provided")
    if all((table_name, stmt)):
        # stacklevel=2 attributes the warning to the calling code, which is
        # where the redundant argument has to be removed.
        warnings.warn(
            "Table name and SQL statement provided. In this case, the statement is used.",
            stacklevel=2,
        )
    if not stmt:
        stmt = f"SELECT * FROM {table_name}"

    odf = conn.fetch_df_all(statement=stmt)
    # pl.from_arrow is typed to return DataFrame | Series; the cast narrows
    # it to DataFrame for the type checker.
    df = cast(pl.DataFrame, pl.from_arrow(odf, schema_overrides=schema))

    return df.lazy()