add preprocessing steps
This commit is contained in:
parent
83d0691d67
commit
5e5486fe53
@ -1,19 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses as dc
|
||||
from collections.abc import Mapping, Set
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from xgboost import XGBRegressor
|
||||
|
||||
from delta_barth.analysis import parse
|
||||
from delta_barth.constants import COL_MAP_SALES_PROGNOSIS, FEATURES_SALES_PROGNOSIS
|
||||
from delta_barth.types import CustomerDataSalesForecast, FcErrorCodes
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from delta_barth.api.common import SalesPrognosisResponse
|
||||
from delta_barth.types import FcResult
|
||||
|
||||
# TODO check pandera for DataFrame validation
|
||||
|
||||
|
||||
def parse_api_resp_to_df(
    resp: SalesPrognosisResponse,
) -> pd.DataFrame:
    """Convert a sales prognosis API response into a DataFrame.

    Args:
        resp: API response object. Its ``error`` attribute must be ``None``.

    Returns:
        A DataFrame constructed from the ``"daten"`` entry of
        ``resp.model_dump()``.

    Raises:
        ValueError: If ``resp.error`` is not ``None``.
    """
    # refuse to parse anything that carries an error payload
    if resp.error is not None:
        raise ValueError("Response contains error code. Parsing aborted.")
    data = resp.model_dump()["daten"]

    return pd.DataFrame(data)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Input:
|
||||
# DataFrame df with columns f_umsatz_fakt, firmen, art, v_warengrp
|
||||
@ -28,8 +43,22 @@ if TYPE_CHECKING:
|
||||
|
||||
# Revenue forecast per company
|
||||
|
||||
|
||||
# TODO: check usage of separate exception and handle it in API function
|
||||
# TODO set min number of data points as constant, not parameter
|
||||
def preprocess_sales_per_customer(
    resp: SalesPrognosisResponse,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Turn a sales prognosis API response into a preprocessed DataFrame.

    The response is first converted with ``parse_api_resp_to_df``; the
    resulting frame is then handed to ``parse.preprocess_features``.

    Args:
        resp: API response to convert.
        feature_map: Forwarded unchanged to ``parse.preprocess_features``.
        target_features: Forwarded unchanged to
            ``parse.preprocess_features``.

    Returns:
        The DataFrame returned by ``parse.preprocess_features``.

    Raises:
        ValueError: Propagated from ``parse_api_resp_to_df`` when the
            response carries an error.
    """
    df = parse_api_resp_to_df(resp)

    return parse.preprocess_features(
        df,
        feature_map=feature_map,
        target_features=target_features,
    )
|
||||
|
||||
|
||||
def sales_per_customer(
|
||||
|
||||
@ -9,9 +9,9 @@ if TYPE_CHECKING:
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def check_needed_features(
|
||||
def _check_needed_features(
|
||||
data: pd.DataFrame,
|
||||
features: Set,
|
||||
features: Set[str],
|
||||
) -> None:
|
||||
data_feats = set(data.columns)
|
||||
missing_features = features - data_feats
|
||||
@ -23,10 +23,11 @@ def check_needed_features(
|
||||
)
|
||||
|
||||
|
||||
def map_features_to_targets(
|
||||
def _map_features_to_targets(
|
||||
data: pd.DataFrame,
|
||||
feature_map: Mapping[str, str],
|
||||
) -> pd.DataFrame:
|
||||
data = data.copy()
|
||||
data_feats = data.columns
|
||||
mapped_feats: list[str] = []
|
||||
|
||||
@ -39,3 +40,14 @@ def map_features_to_targets(
|
||||
data.columns = mapped_feats
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def preprocess_features(
    data: pd.DataFrame,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> pd.DataFrame:
    """Map the columns of ``data`` and verify the needed features exist.

    Runs ``_map_features_to_targets`` first and then validates the mapped
    frame with ``_check_needed_features``, so ``target_features`` must be
    given in terms of the already mapped column names.

    Args:
        data: Input frame; it is passed to ``_map_features_to_targets``.
        feature_map: Column mapping handed to ``_map_features_to_targets``.
        target_features: Column names that must be present after mapping.

    Returns:
        The frame returned by ``_map_features_to_targets``.

    Raises:
        Whatever ``_check_needed_features`` raises for missing features
        (presumably ``FeaturesMissingError`` -- verify against its body).
    """
    data = _map_features_to_targets(data, feature_map)
    # validation happens after the mapping on purpose: the check is done
    # against the mapped column names
    _check_needed_features(data, target_features)

    return data
|
||||
|
||||
BIN
tests/_test_data/exmp_sales_prognosis_resp.pkl
Normal file
BIN
tests/_test_data/exmp_sales_prognosis_resp.pkl
Normal file
Binary file not shown.
@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from delta_barth.analysis import forecast as fc
|
||||
|
||||
|
||||
@ -15,3 +17,44 @@ def test_sales_per_customer_too_few_data_points(sales_data):
|
||||
|
||||
assert err == 1
|
||||
assert res is None
|
||||
|
||||
|
||||
def test_parse_api_resp_to_df(exmpl_api_sales_prognosis_resp):
    """The parsed example response only contains known API column names."""
    resp = exmpl_api_sales_prognosis_resp
    df = fc.parse_api_resp_to_df(resp)
    features = {
        "artikelId",
        "warengruppeId",
        "firmaId",
        "betrag",
        "menge",
        "buchungsDatum",
    }
    # guard against a vacuous pass: the subset check below is trivially
    # true for a frame without any columns
    assert len(df.columns) > 0
    assert all(col in features for col in df.columns)
|
||||
|
||||
|
||||
def test_preprocess_sales_per_customer(exmpl_api_sales_prognosis_resp):
    """Preprocessing the example response yields five mapped columns."""
    resp = exmpl_api_sales_prognosis_resp
    feat_mapping: dict[str, str] = {
        "artikelId": "artikel_refid",
        "firmaId": "firma_refid",
        "betrag": "betrag",
        "menge": "menge",
        "buchungsDatum": "buchungs_datum",
    }
    target_features: frozenset[str] = frozenset(
        (
            "firma_refid",
            "betrag",
            "buchungs_datum",
        )
    )
    df = fc.preprocess_sales_per_customer(
        resp,
        feature_map=feat_mapping,
        target_features=target_features,
    )
    assert len(df.columns) == 5
    # ``any`` rather than ``all``: some source names ("betrag", "menge")
    # map onto themselves and therefore remain present after renaming
    assert any(feat not in df.columns for feat in feat_mapping)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from delta_barth.analysis import parse
|
||||
from delta_barth.analysis import forecast, parse
|
||||
from delta_barth.errors import FeaturesMissingError
|
||||
|
||||
|
||||
@ -10,12 +10,12 @@ def test_check_needed_features():
|
||||
data = pd.DataFrame(
|
||||
data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"]
|
||||
)
|
||||
parse.check_needed_features(data, target_features)
|
||||
parse._check_needed_features(data, target_features)
|
||||
data = pd.DataFrame(
|
||||
data=[[1, 2, 3, 4, 5]], columns=["featX", "feat2", "feat3", "feat4", "feat5"]
|
||||
)
|
||||
with pytest.raises(FeaturesMissingError):
|
||||
parse.check_needed_features(data, target_features)
|
||||
parse._check_needed_features(data, target_features)
|
||||
|
||||
|
||||
def test_map_features_to_targets():
|
||||
@ -23,7 +23,7 @@ def test_map_features_to_targets():
|
||||
data = pd.DataFrame(
|
||||
data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"]
|
||||
)
|
||||
data = parse.map_features_to_targets(data, feature_map)
|
||||
data = parse._map_features_to_targets(data, feature_map)
|
||||
assert "feat10" in data.columns
|
||||
assert "feat20" in data.columns
|
||||
assert "feat50" in data.columns
|
||||
@ -32,3 +32,28 @@ def test_map_features_to_targets():
|
||||
assert "feat1" not in data.columns
|
||||
assert "feat2" not in data.columns
|
||||
assert "feat5" not in data.columns
|
||||
|
||||
|
||||
def test_preprocess_features(exmpl_api_sales_prognosis_resp):
    """``parse.preprocess_features`` renames columns without dropping any."""
    resp = exmpl_api_sales_prognosis_resp
    df = forecast.parse_api_resp_to_df(resp)
    feat_mapping: dict[str, str] = {
        "artikelId": "artikel_refid",
        "firmaId": "firma_refid",
        "betrag": "betrag",
        "menge": "menge",
        "buchungsDatum": "buchungs_datum",
    }
    target_features: frozenset[str] = frozenset(
        (
            "firma_refid",
            "betrag",
            "buchungs_datum",
        )
    )

    # precondition: every source feature exists before the mapping
    assert all(feat in df.columns for feat in feat_mapping)
    data = parse.preprocess_features(df, feat_mapping, target_features)
    assert len(data.columns) == len(df.columns)
    assert (data.columns != df.columns).any()
    # ``any`` rather than ``all``: some source names ("betrag", "menge")
    # map onto themselves and therefore remain present after renaming
    assert any(feat not in data.columns for feat in feat_mapping)
|
||||
|
||||
@ -1,10 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pickle
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
from typing import Any, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from delta_barth.api.common import SalesPrognosisResponse
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def credentials() -> dict[str, str]:
|
||||
@ -56,3 +62,16 @@ def sales_data() -> pd.DataFrame:
|
||||
data["buchungs_datum"] = pd.to_datetime(data["buchungs_datum"])
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def exmpl_api_sales_prognosis_resp() -> SalesPrognosisResponse:
    """Load the pickled example sales prognosis response once per session.

    The data file is resolved relative to the current working directory,
    so the result depends on where the test run is started.
    """
    pwd = Path.cwd()
    # NOTE(review): the message speaks of the project root, but the check
    # inspects the name of the *parent* of the working directory -- confirm
    # that this matches the intended repository layout
    assert "barth" in pwd.parent.name.lower(), "not in project root directory"
    data_pth = pwd / "./tests/_test_data/exmp_sales_prognosis_resp.pkl"
    assert data_pth.exists(), "file to API sales data not found"

    # unpickling is only acceptable because this is a trusted test artifact
    # shipped with the repository; never point this at external data
    with data_pth.open("rb") as file:
        data = cast("SalesPrognosisResponse", pickle.load(file))

    return data
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user