diff --git a/src/delta_barth/analysis/forecast.py b/src/delta_barth/analysis/forecast.py index 8d18bc6..15fb198 100644 --- a/src/delta_barth/analysis/forecast.py +++ b/src/delta_barth/analysis/forecast.py @@ -1,19 +1,34 @@ from __future__ import annotations import dataclasses as dc +from collections.abc import Mapping, Set from typing import TYPE_CHECKING import pandas as pd from sklearn.metrics import mean_squared_error from xgboost import XGBRegressor +from delta_barth.analysis import parse +from delta_barth.constants import COL_MAP_SALES_PROGNOSIS, FEATURES_SALES_PROGNOSIS from delta_barth.types import CustomerDataSalesForecast, FcErrorCodes if TYPE_CHECKING: + from delta_barth.api.common import SalesPrognosisResponse from delta_barth.types import FcResult # TODO check pandera for DataFrame validation + +def parse_api_resp_to_df( + resp: SalesPrognosisResponse, +) -> pd.DataFrame: + if resp.error is not None: + raise ValueError("Response contains error code. Parsing aborted.") + data = resp.model_dump()["daten"] + + return pd.DataFrame(data) + + # ------------------------------------------------------------------------------ # Input: # DataFrame df mit Columns f_umsatz_fakt, firmen, art, v_warengrp @@ -28,8 +43,22 @@ if TYPE_CHECKING: # Prognose Umsatz je Firma + # TODO: check usage of separate exception and handle it in API function # TODO set min number of data points as constant, not parameter +def preprocess_sales_per_customer( + resp: SalesPrognosisResponse, + feature_map: Mapping[str, str], + target_features: Set[str], +) -> pd.DataFrame: + df = parse_api_resp_to_df(resp) + df = parse.preprocess_features( + df, + feature_map=feature_map, + target_features=target_features, + ) + + return df def sales_per_customer( diff --git a/src/delta_barth/analysis/parse.py b/src/delta_barth/analysis/parse.py index 7693d92..bec6308 100644 --- a/src/delta_barth/analysis/parse.py +++ b/src/delta_barth/analysis/parse.py @@ -9,9 +9,9 @@ if TYPE_CHECKING: import pandas as pd -def check_needed_features( +def _check_needed_features( data: pd.DataFrame, - features: Set, + features: Set[str], ) -> None: data_feats = set(data.columns) missing_features = features - data_feats @@ -23,10 +23,11 @@ def check_needed_features( ) -def map_features_to_targets( +def _map_features_to_targets( data: pd.DataFrame, feature_map: Mapping[str, str], ) -> pd.DataFrame: + data = data.copy() data_feats = data.columns mapped_feats: list[str] = [] @@ -39,3 +40,14 @@ def map_features_to_targets( data.columns = mapped_feats return data + + +def preprocess_features( + data: pd.DataFrame, + feature_map: Mapping[str, str], + target_features: Set[str], +) -> pd.DataFrame: + data = _map_features_to_targets(data, feature_map) + _check_needed_features(data, target_features) + + return data diff --git a/tests/_test_data/exmp_sales_prognosis_resp.pkl b/tests/_test_data/exmp_sales_prognosis_resp.pkl new file mode 100644 index 0000000..da20733 Binary files /dev/null and b/tests/_test_data/exmp_sales_prognosis_resp.pkl differ diff --git a/tests/analysis/test_forecast.py b/tests/analysis/test_forecast.py index 852fd1b..7d558ae 100644 --- a/tests/analysis/test_forecast.py +++ b/tests/analysis/test_forecast.py @@ -1,3 +1,5 @@ +import pytest + from delta_barth.analysis import forecast as fc @@ -15,3 +17,44 @@ def test_sales_per_customer_too_few_data_points(sales_data): assert err == 1 assert res is None + + +def test_parse_api_resp_to_df(exmpl_api_sales_prognosis_resp): + resp = exmpl_api_sales_prognosis_resp + df = fc.parse_api_resp_to_df(resp) + features = set( + ( + "artikelId", + "warengruppeId", + "firmaId", + "betrag", + "menge", + "buchungsDatum", + ) + ) + assert all(col in features for col in df.columns) + + +def test_preprocess_sales_per_customer(exmpl_api_sales_prognosis_resp): + resp = exmpl_api_sales_prognosis_resp + feat_mapping: dict[str, str] = { + "artikelId": "artikel_refid", + "firmaId": "firma_refid", + "betrag": "betrag", + "menge": "menge", + "buchungsDatum": "buchungs_datum", + } + target_features: frozenset[str] = frozenset( + ( + "firma_refid", + "betrag", + "buchungs_datum", + ) + ) + df = fc.preprocess_sales_per_customer( + resp, + feature_map=feat_mapping, + target_features=target_features, + ) + assert len(df.columns) == 5 + assert any(feat not in df.columns for feat in feat_mapping.keys()) diff --git a/tests/analysis/test_parse.py b/tests/analysis/test_parse.py index ad92e19..0047f8e 100644 --- a/tests/analysis/test_parse.py +++ b/tests/analysis/test_parse.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from delta_barth.analysis import parse +from delta_barth.analysis import forecast, parse from delta_barth.errors import FeaturesMissingError @@ -10,12 +10,12 @@ def test_check_needed_features(): data = pd.DataFrame( data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"] ) - parse.check_needed_features(data, target_features) + parse._check_needed_features(data, target_features) data = pd.DataFrame( data=[[1, 2, 3, 4, 5]], columns=["featX", "feat2", "feat3", "feat4", "feat5"] ) with pytest.raises(FeaturesMissingError): - parse.check_needed_features(data, target_features) + parse._check_needed_features(data, target_features) def test_map_features_to_targets(): @@ -23,7 +23,7 @@ def test_map_features_to_targets(): data = pd.DataFrame( data=[[1, 2, 3, 4, 5]], columns=["feat1", "feat2", "feat3", "feat4", "feat5"] ) - data = parse.map_features_to_targets(data, feature_map) + data = parse._map_features_to_targets(data, feature_map) assert "feat10" in data.columns assert "feat20" in data.columns assert "feat50" in data.columns @@ -32,3 +32,28 @@ def test_map_features_to_targets(): assert "feat1" not in data.columns assert "feat2" not in data.columns assert "feat5" not in data.columns + + +def test_preprocess_features(exmpl_api_sales_prognosis_resp): + resp = exmpl_api_sales_prognosis_resp + df = forecast.parse_api_resp_to_df(resp) + feat_mapping: dict[str, str] = { + "artikelId": "artikel_refid", + "firmaId": "firma_refid", + "betrag": "betrag", + "menge": "menge", + "buchungsDatum": "buchungs_datum", + } + target_features: frozenset[str] = frozenset( + ( + "firma_refid", + "betrag", + "buchungs_datum", + ) + ) + + assert all(feat in df.columns for feat in feat_mapping.keys()) + data = parse.preprocess_features(df, feat_mapping, target_features) + assert len(data.columns) == len(df.columns) + assert (data.columns != df.columns).any() + assert any(feat not in data.columns for feat in feat_mapping.keys()) diff --git a/tests/conftest.py b/tests/conftest.py index dd2ed2d..a3db0ad 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,16 @@ +from __future__ import annotations + +import pickle import tomllib from pathlib import Path -from typing import Any, cast +from typing import TYPE_CHECKING, Any, cast import pandas as pd import pytest +if TYPE_CHECKING: + from delta_barth.api.common import SalesPrognosisResponse + @pytest.fixture(scope="session") def credentials() -> dict[str, str]: @@ -56,3 +62,16 @@ def sales_data() -> pd.DataFrame: data["buchungs_datum"] = pd.to_datetime(data["buchungs_datum"]) return data + + +@pytest.fixture(scope="session") +def exmpl_api_sales_prognosis_resp() -> SalesPrognosisResponse: + pwd = Path.cwd() + assert "barth" in pwd.parent.name.lower(), "not in project root directory" + data_pth = pwd / "./tests/_test_data/exmp_sales_prognosis_resp.pkl" + assert data_pth.exists(), "file to API sales data not found" + + with open(data_pth, "rb") as file: + data = cast("SalesPrognosisResponse", pickle.load(file)) + + return data