add preprocessing steps

This commit is contained in:
Florian Förster 2025-02-27 13:13:29 +01:00
parent 83d0691d67
commit 5e5486fe53
6 changed files with 136 additions and 8 deletions

View File

@ -1,19 +1,34 @@
from __future__ import annotations from __future__ import annotations
import dataclasses as dc import dataclasses as dc
from collections.abc import Mapping, Set
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import pandas as pd import pandas as pd
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor from xgboost import XGBRegressor
from delta_barth.analysis import parse
from delta_barth.constants import COL_MAP_SALES_PROGNOSIS, FEATURES_SALES_PROGNOSIS
from delta_barth.types import CustomerDataSalesForecast, FcErrorCodes from delta_barth.types import CustomerDataSalesForecast, FcErrorCodes
if TYPE_CHECKING: if TYPE_CHECKING:
from delta_barth.api.common import SalesPrognosisResponse
from delta_barth.types import FcResult from delta_barth.types import FcResult
# TODO check pandera for DataFrame validation # TODO check pandera for DataFrame validation
def parse_api_resp_to_df(
    resp: SalesPrognosisResponse,
) -> pd.DataFrame:
    """Convert an API sales-prognosis response into a DataFrame.

    The response is dumped to a plain dict and its ``"daten"`` payload
    becomes the DataFrame rows.

    Raises:
        ValueError: if the response carries a non-``None`` error code.
    """
    if resp.error is not None:
        raise ValueError("Response contains error code. Parsing aborted.")
    payload = resp.model_dump()
    return pd.DataFrame(payload["daten"])
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Input: # Input:
# DataFrame df mit Columns f_umsatz_fakt, firmen, art, v_warengrp # DataFrame df mit Columns f_umsatz_fakt, firmen, art, v_warengrp
@ -28,8 +43,22 @@ if TYPE_CHECKING:
# Prognose Umsatz je Firma # Prognose Umsatz je Firma
# TODO: check usage of separate exception and handle it in API function # TODO: check usage of separate exception and handle it in API function
# TODO set min number of data points as constant, not parameter # TODO set min number of data points as constant, not parameter
def preprocess_sales_per_customer(
    resp: SalesPrognosisResponse,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Parse a sales-prognosis response and run the feature preprocessing.

    Args:
        resp: raw API response; must not carry an error code.
        feature_map: mapping from raw API column names to target names.
        target_features: column names that must exist after mapping.

    Returns:
        The preprocessed DataFrame with renamed, validated columns.
    """
    raw = parse_api_resp_to_df(resp)
    return parse.preprocess_features(
        raw,
        feature_map=feature_map,
        target_features=target_features,
    )
def sales_per_customer( def sales_per_customer(

View File

@ -9,9 +9,9 @@ if TYPE_CHECKING:
import pandas as pd import pandas as pd
def check_needed_features( def _check_needed_features(
data: pd.DataFrame, data: pd.DataFrame,
features: Set, features: Set[str],
) -> None: ) -> None:
data_feats = set(data.columns) data_feats = set(data.columns)
missing_features = features - data_feats missing_features = features - data_feats
@ -23,10 +23,11 @@ def check_needed_features(
) )
def map_features_to_targets( def _map_features_to_targets(
data: pd.DataFrame, data: pd.DataFrame,
feature_map: Mapping[str, str], feature_map: Mapping[str, str],
) -> pd.DataFrame: ) -> pd.DataFrame:
data = data.copy()
data_feats = data.columns data_feats = data.columns
mapped_feats: list[str] = [] mapped_feats: list[str] = []
@ -39,3 +40,14 @@ def map_features_to_targets(
data.columns = mapped_feats data.columns = mapped_feats
return data return data
def preprocess_features(
    data: pd.DataFrame,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Rename columns via *feature_map*, then verify *target_features* exist.

    Returns a renamed copy of *data*; raises if any target feature is
    missing after the mapping (see ``_check_needed_features``).
    """
    renamed = _map_features_to_targets(data, feature_map)
    _check_needed_features(renamed, target_features)
    return renamed

Binary file not shown.

View File

@ -1,3 +1,5 @@
import pytest
from delta_barth.analysis import forecast as fc from delta_barth.analysis import forecast as fc
@ -15,3 +17,44 @@ def test_sales_per_customer_too_few_data_points(sales_data):
assert err == 1 assert err == 1
assert res is None assert res is None
def test_parse_api_resp_to_df(exmpl_api_sales_prognosis_resp):
    """The parsed DataFrame must expose only the known raw API columns."""
    df = fc.parse_api_resp_to_df(exmpl_api_sales_prognosis_resp)
    expected_cols = {
        "artikelId",
        "warengruppeId",
        "firmaId",
        "betrag",
        "menge",
        "buchungsDatum",
    }
    assert set(df.columns) <= expected_cols
def test_preprocess_sales_per_customer(exmpl_api_sales_prognosis_resp):
    """Full preprocessing pipeline renames columns and keeps the column count."""
    feat_mapping: dict[str, str] = {
        "artikelId": "artikel_refid",
        "firmaId": "firma_refid",
        "betrag": "betrag",
        "menge": "menge",
        "buchungsDatum": "buchungs_datum",
    }
    target_features: frozenset[str] = frozenset(
        {"firma_refid", "betrag", "buchungs_datum"}
    )
    df = fc.preprocess_sales_per_customer(
        exmpl_api_sales_prognosis_resp,
        feature_map=feat_mapping,
        target_features=target_features,
    )
    assert len(df.columns) == 5
    # At least one raw API column name must have been mapped away.
    assert not set(feat_mapping).issubset(df.columns)

View File

@ -1,7 +1,7 @@
import pandas as pd import pandas as pd
import pytest import pytest
from delta_barth.analysis import parse from delta_barth.analysis import forecast, parse
from delta_barth.errors import FeaturesMissingError from delta_barth.errors import FeaturesMissingError
@ -10,12 +10,12 @@ def test_check_needed_features():
data = pd.DataFrame( data = pd.DataFrame(
data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"] data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"]
) )
parse.check_needed_features(data, target_features) parse._check_needed_features(data, target_features)
data = pd.DataFrame( data = pd.DataFrame(
data=[[1, 2, 3, 4, 5]], columns=["featX", "feat2", "feat3", "feat4", "feat5"] data=[[1, 2, 3, 4, 5]], columns=["featX", "feat2", "feat3", "feat4", "feat5"]
) )
with pytest.raises(FeaturesMissingError): with pytest.raises(FeaturesMissingError):
parse.check_needed_features(data, target_features) parse._check_needed_features(data, target_features)
def test_map_features_to_targets(): def test_map_features_to_targets():
@ -23,7 +23,7 @@ def test_map_features_to_targets():
data = pd.DataFrame( data = pd.DataFrame(
data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"] data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"]
) )
data = parse.map_features_to_targets(data, feature_map) data = parse._map_features_to_targets(data, feature_map)
assert "feat10" in data.columns assert "feat10" in data.columns
assert "feat20" in data.columns assert "feat20" in data.columns
assert "feat50" in data.columns assert "feat50" in data.columns
@ -32,3 +32,28 @@ def test_map_features_to_targets():
assert "feat1" not in data.columns assert "feat1" not in data.columns
assert "feat2" not in data.columns assert "feat2" not in data.columns
assert "feat5" not in data.columns assert "feat5" not in data.columns
def test_preprocess_features(exmpl_api_sales_prognosis_resp):
    """preprocess_features renames mapped columns while preserving the count."""
    raw = forecast.parse_api_resp_to_df(exmpl_api_sales_prognosis_resp)
    feat_mapping: dict[str, str] = {
        "artikelId": "artikel_refid",
        "firmaId": "firma_refid",
        "betrag": "betrag",
        "menge": "menge",
        "buchungsDatum": "buchungs_datum",
    }
    target_features: frozenset[str] = frozenset(
        {"firma_refid", "betrag", "buchungs_datum"}
    )
    # All raw API columns must be present before preprocessing.
    assert set(feat_mapping) <= set(raw.columns)
    processed = parse.preprocess_features(raw, feat_mapping, target_features)
    assert len(processed.columns) == len(raw.columns)
    assert (processed.columns != raw.columns).any()
    # At least one raw name must have been replaced by its mapped target.
    assert not set(feat_mapping).issubset(processed.columns)

View File

@ -1,10 +1,16 @@
from __future__ import annotations
import pickle
import tomllib import tomllib
from pathlib import Path from pathlib import Path
from typing import Any, cast from typing import TYPE_CHECKING, Any, cast
import pandas as pd import pandas as pd
import pytest import pytest
if TYPE_CHECKING:
from delta_barth.api.common import SalesPrognosisResponse
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def credentials() -> dict[str, str]: def credentials() -> dict[str, str]:
@ -56,3 +62,16 @@ def sales_data() -> pd.DataFrame:
data["buchungs_datum"] = pd.to_datetime(data["buchungs_datum"]) data["buchungs_datum"] = pd.to_datetime(data["buchungs_datum"])
return data return data
@pytest.fixture(scope="session")
def exmpl_api_sales_prognosis_resp() -> SalesPrognosisResponse:
    """Load the pickled example API sales-prognosis response from test data.

    Only works when pytest is invoked from the project root; the guard
    asserts fail fast with a readable message otherwise.
    """
    root = Path.cwd()
    assert "barth" in root.parent.name.lower(), "not in project root directory"
    data_pth = root / "./tests/_test_data/exmp_sales_prognosis_resp.pkl"
    assert data_pth.exists(), "file to API sales data not found"
    # NOTE(review): pickle is fine here because the file is trusted test data.
    with data_pth.open("rb") as file:
        return cast("SalesPrognosisResponse", pickle.load(file))