add preprocessing steps

This commit is contained in:
2025-02-27 13:13:29 +01:00
parent 83d0691d67
commit 5e5486fe53
6 changed files with 136 additions and 8 deletions

View File

@@ -1,19 +1,34 @@
from __future__ import annotations
import dataclasses as dc
from collections.abc import Mapping, Set
from typing import TYPE_CHECKING
import pandas as pd
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from delta_barth.analysis import parse
from delta_barth.constants import COL_MAP_SALES_PROGNOSIS, FEATURES_SALES_PROGNOSIS
from delta_barth.types import CustomerDataSalesForecast, FcErrorCodes
if TYPE_CHECKING:
from delta_barth.api.common import SalesPrognosisResponse
from delta_barth.types import FcResult
# TODO check pandera for DataFrame validation
def parse_api_resp_to_df(resp: SalesPrognosisResponse) -> pd.DataFrame:
    """Build a DataFrame from the ``daten`` entry of a sales prognosis response.

    Args:
        resp: Response object. Its ``error`` attribute is inspected and its
            ``model_dump()`` output is read.

    Returns:
        A DataFrame constructed from ``resp.model_dump()["daten"]``.

    Raises:
        ValueError: If ``resp.error`` is not ``None``.
    """
    # Refuse to parse anything that carries an error marker.
    has_error = resp.error is not None
    if has_error:
        raise ValueError("Response contains error code. Parsing aborted.")

    dumped = resp.model_dump()
    records = dumped["daten"]
    return pd.DataFrame(records)
# ------------------------------------------------------------------------------
# Input:
# DataFrame df with columns f_umsatz_fakt, firmen, art, v_warengrp
@@ -28,8 +43,22 @@ if TYPE_CHECKING:
# Sales forecast per company
# TODO: check usage of separate exception and handle it in API function
# TODO set min number of data points as constant, not parameter
def preprocess_sales_per_customer(
    resp: SalesPrognosisResponse,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Turn a sales prognosis response into a preprocessed DataFrame.

    The response is first converted with ``parse_api_resp_to_df``; the
    resulting frame is then handed to ``parse.preprocess_features``.

    Args:
        resp: Response object to convert into a DataFrame.
        feature_map: Forwarded unchanged as ``feature_map`` to
            ``parse.preprocess_features``.
        target_features: Forwarded unchanged as ``target_features`` to
            ``parse.preprocess_features``.

    Returns:
        Whatever ``parse.preprocess_features`` returns for the parsed frame.
    """
    raw_frame = parse_api_resp_to_df(resp)
    return parse.preprocess_features(
        raw_frame,
        feature_map=feature_map,
        target_features=target_features,
    )
def sales_per_customer(

View File

@@ -9,9 +9,9 @@ if TYPE_CHECKING:
import pandas as pd
def check_needed_features(
def _check_needed_features(
data: pd.DataFrame,
features: Set,
features: Set[str],
) -> None:
data_feats = set(data.columns)
missing_features = features - data_feats
@@ -23,10 +23,11 @@ def check_needed_features(
)
def map_features_to_targets(
def _map_features_to_targets(
data: pd.DataFrame,
feature_map: Mapping[str, str],
) -> pd.DataFrame:
data = data.copy()
data_feats = data.columns
mapped_feats: list[str] = []
@@ -39,3 +40,14 @@ def map_features_to_targets(
data.columns = mapped_feats
return data
def preprocess_features(
    data: pd.DataFrame,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Map the columns of ``data`` and verify the needed features.

    Args:
        data: Input frame; passed to ``_map_features_to_targets``.
        feature_map: Mapping handed to ``_map_features_to_targets``.
        target_features: Feature names handed to ``_check_needed_features``
            together with the mapped frame.

    Returns:
        The frame returned by ``_map_features_to_targets``.
    """
    mapped = _map_features_to_targets(data, feature_map)
    # Called for its check only; its return value is not used.
    # NOTE(review): presumably signals missing target features - confirm.
    _check_needed_features(mapped, target_features)
    return mapped