from __future__ import annotations
import dataclasses as dc
from collections.abc import Mapping, Set
from datetime import datetime as Datetime
from typing import TYPE_CHECKING, Final
import pandas as pd
from xgboost import XGBRegressor
from delta_barth.analysis import parse
from delta_barth.api.requests import (
    SalesPrognosisResponse,
    SalesPrognosisResults,
    SalesPrognosisResultsExport,
    get_sales_prognosis_data,
)
from delta_barth.constants import (
    COL_MAP_SALES_PROGNOSIS,
    FEATURES_SALES_PROGNOSIS,
    MIN_NUMBER_DATAPOINTS,
)
from delta_barth.errors import STATUS_HANDLER, wrap_result
from delta_barth.types import (
    DualDict,
    PipeResult,
)

if TYPE_CHECKING:
    from delta_barth.api.common import Session
    from delta_barth.types import Status


def _parse_api_resp_to_df(
    resp: SalesPrognosisResponse,
) -> pd.DataFrame:
    """Convert a sales prognosis API response into a DataFrame.

    Parameters
    ----------
    resp : SalesPrognosisResponse
        Validated API response; its "daten" records become the rows.

    Returns
    -------
    pd.DataFrame
        One row per record in the response's "daten" field.
    """
    data = resp.model_dump()["daten"]
    return pd.DataFrame(data)


def _parse_df_to_api_resp(
    data: pd.DataFrame,
) -> SalesPrognosisResponse:
    """Convert a DataFrame back into a sales prognosis API response."""
    records = data.to_dict(orient="records")
    return SalesPrognosisResponse(daten=tuple(records))  # type: ignore


def _parse_df_to_results(
    data: pd.DataFrame,
) -> SalesPrognosisResults:
    """Convert a DataFrame into sales prognosis result records."""
    records = data.to_dict(orient="records")
    return SalesPrognosisResults(daten=tuple(records))  # type: ignore
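

# A round-trip sketch for the parsers above, using synthetic records; the
# field names follow the input contract documented in _process_sales below,
# the values are made up.
def _example_parse_roundtrip() -> pd.DataFrame:  # pragma: no cover
    """Illustration only: DataFrame -> SalesPrognosisResponse -> DataFrame."""
    df = pd.DataFrame(
        {
            "artikel_refid": [1, 2],
            "firma_refid": [7, 7],
            "betrag": [99.5, 12.0],
            "menge": [2, 1],
            "buchungs_datum": ["2023-01-31", "2023-02-28"],
        }
    )
    return _parse_api_resp_to_df(_parse_df_to_api_resp(df))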


@wrap_result()
def _parse_api_resp_to_df_wrapped(
    resp: SalesPrognosisResponse,
) -> pd.DataFrame:
    return _parse_api_resp_to_df(resp)


@wrap_result()
def _parse_df_to_api_resp_wrapped(
    data: pd.DataFrame,
) -> SalesPrognosisResponse:
    return _parse_df_to_api_resp(data)


@wrap_result()
def _parse_df_to_results_wrapped(
    data: pd.DataFrame,
) -> SalesPrognosisResults:
    return _parse_df_to_results(data)
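

# A minimal sketch of the status-checking pattern the pipeline steps below
# build on; it assumes, as every call site in this module does, that
# `wrap_result` yields an object exposing `.status` and `.unwrap()`.
def _example_wrapped_parse(
    resp: SalesPrognosisResponse,
) -> pd.DataFrame | None:  # pragma: no cover
    """Illustration only: unwrap on success, bail out on any other status."""
    res = _parse_api_resp_to_df_wrapped(resp)
    if res.status != STATUS_HANDLER.SUCCESS:
        return None  # the real steps propagate res.status via PipeResult.fail
    return res.unwrap()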


# ------------------------------------------------------------------------------
# Input:
#   DataFrame df with columns f_umsatz_fakt, firmen, art, v_warengrp
#   kunde (must be contained in df["firmen"]["firma_refid"])
# Output:
#   Integer umsetzung (forecast possible): 0 yes, 1 no (too few data points
#   available), 2 no (data not suitable for forecasting)
#   DataFrame test: Jahr, Monat, Vorhersage
# ------------------------------------------------------------------------------
# Sales forecast per company
# TODO: check usage of separate exception and handle it in API function
# TODO: set min number of data points as constant, not parameter
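# Illustrative shape (made-up values) of the forecast frame produced by
# _process_sales below:
#
#     jahr  monat  vorhersage
#     2023     10    12345.67
#     2023     11    11876.02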


def _preprocess_sales(
    resp: SalesPrognosisResponse,
    feature_map: Mapping[str, str],
    target_features: Set[str],
) -> PipeResult[SalesPrognosisResultsExport]:
    """Parse the API response into a DataFrame and map its features.

    Parameters
    ----------
    resp : SalesPrognosisResponse
        Raw sales data as returned by the API.
    feature_map : Mapping[str, str]
        Mapping from API column names to the names used internally.
    target_features : Set[str]
        Columns the forecasting step requires.

    Returns
    -------
    PipeResult
        Successful pipe carrying the parsed DataFrame, or a failed pipe
        carrying the first non-success status.
    """
    pipe: PipeResult[SalesPrognosisResultsExport] = PipeResult(None, STATUS_HANDLER.SUCCESS)
    res = _parse_api_resp_to_df_wrapped(resp)
    if res.status != STATUS_HANDLER.SUCCESS:
        pipe.fail(res.status)
        return pipe
    df = res.result
    res = parse.process_features_wrapped(
        df,
        feature_map=feature_map,
        target_features=target_features,
    )
    if res.status != STATUS_HANDLER.SUCCESS:
        pipe.fail(res.status)
        return pipe
    pipe.success(res.unwrap(), res.status)
    return pipe


def _process_sales(
    pipe: PipeResult[SalesPrognosisResultsExport],
    min_num_data_points: int = 100,
) -> PipeResult[SalesPrognosisResultsExport]:
    """Aggregate bookings into monthly sums and predict the last five months.

    Input data:
        fields: ["artikel_refid", "firma_refid", "betrag", "menge", "buchungs_datum"]
        - data already prefiltered if a customer ID was provided
          ("firma_refid" same for all entries)

    Parameters
    ----------
    pipe : PipeResult
        Pipe carrying the preprocessed sales DataFrame.
    min_num_data_points : int, optional
        Minimum number of rows required for a forecast, by default 100.

    Returns
    -------
    PipeResult
        Successful pipe carrying a DataFrame with columns "jahr", "monat"
        and "vorhersage", or a failed pipe if too few data points exist.
    """
    # cust_data: CustomerDataSalesForecast = CustomerDataSalesForecast()
    # filter data
    data = pipe.data
    assert data is not None, "processing not existing pipe result"
    data = data.copy()
    # df_firma = data[
    #     (data["firma_refid"] == company_id) & (data["beleg_typ"] == 1) & (data["betrag"] > 0)
    # ]
    # for transaction in df_firma["vorgang_refid"].unique():
    #     cust_data.order.append(transaction)
    #     cust_data.date.append(
    #         df_firma[df_firma["vorgang_refid"] == transaction]["buchungs_datum"].iloc[0]
    #     )
    #     cust_data.sales.append(
    #         df_firma[df_firma["vorgang_refid"] == transaction]["betrag"].sum()
    #     )
    # df_cust = pd.DataFrame(dc.asdict(cust_data))
    # df_cust = df_cust.sort_values(by="date").reset_index()
    DATE_FEAT: Final[str] = "buchungs_datum"
    SALES_FEAT: Final[str] = "betrag"
    # keep positive bookings only
    df_firma = data[data[SALES_FEAT] > 0]
    df_cust = df_firma.copy()
    df_cust = df_cust.sort_values(by=DATE_FEAT).reset_index()
    # check data availability
    # TODO rework criteria
    if len(df_cust) < min_num_data_points:
        pipe.fail(STATUS_HANDLER.pipe_states.TOO_FEW_POINTS)
        return pipe
    # sales development: aggregate over defined monthly periods
    # (buchungs_datum is expected to be datetime-typed at this point)
    df_cust["jahr"] = df_cust[DATE_FEAT].dt.year
    df_cust["monat"] = df_cust[DATE_FEAT].dt.month
    monthly_sum = df_cust.groupby(["jahr", "monat"])[SALES_FEAT].sum().reset_index()
    monthly_sum[DATE_FEAT] = (
        monthly_sum["monat"].astype(str) + "." + monthly_sum["jahr"].astype(str)
    )
    monthly_sum[DATE_FEAT] = pd.to_datetime(monthly_sum[DATE_FEAT], format="%m.%Y")
    monthly_sum = monthly_sum.set_index(DATE_FEAT)
    # hold out the last five months as the forecast horizon
    train = monthly_sum.iloc[:-5].copy()
    test = monthly_sum.iloc[-5:].copy()
    features = ["jahr", "monat"]
    target = SALES_FEAT
    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]
    reg = XGBRegressor(
        base_score=0.5,
        booster="gbtree",
        n_estimators=1000,
        early_stopping_rounds=50,
        objective="reg:squarederror",
        max_depth=3,
        learning_rate=0.01,
    )
    reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
    test.loc[:, "vorhersage"] = reg.predict(X_test)
    test = test.drop(SALES_FEAT, axis=1)
    test = test.reset_index(drop=True)
    pipe.success(test, STATUS_HANDLER.SUCCESS)
    return pipe
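

# A worked toy example of the monthly aggregation performed above; the data
# is synthetic and the helper exists purely for illustration.
def _example_monthly_aggregation() -> pd.DataFrame:  # pragma: no cover
    """Sketch: three bookings collapse into two (jahr, monat) rows."""
    df = pd.DataFrame(
        {
            "buchungs_datum": pd.to_datetime(["2023-01-05", "2023-01-20", "2023-02-03"]),
            "betrag": [100.0, 50.0, 75.0],
        }
    )
    df["jahr"] = df["buchungs_datum"].dt.year
    df["monat"] = df["buchungs_datum"].dt.month
    # -> jahr=2023/monat=1 sums to 150.0, jahr=2023/monat=2 to 75.0
    return df.groupby(["jahr", "monat"])["betrag"].sum().reset_index()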


def _postprocess_sales(
    pipe: PipeResult[SalesPrognosisResultsExport],
    feature_map: Mapping[str, str],
) -> PipeResult[SalesPrognosisResultsExport]:
    data = pipe.data
    assert data is not None, "processing not existing pipe result"
    # convert features back to original naming
    res = parse.process_features_wrapped(
        data,
        feature_map=feature_map,
        target_features=set(),
    )
    if res.status != STATUS_HANDLER.SUCCESS:
        pipe.fail(res.status)
        return pipe
    # res = _parse_df_to_api_resp_wrapped(res.result)
    res = _parse_df_to_results_wrapped(res.unwrap())
    if res.status != STATUS_HANDLER.SUCCESS:
        pipe.fail(res.status)
        return pipe
    export_response = SalesPrognosisResultsExport(
        response=res.unwrap(),
        status=res.status,
    )
    pipe.export(export_response)
    return pipe


def _export_on_fail(
    status: Status,
) -> SalesPrognosisResultsExport:
    response = SalesPrognosisResults(daten=tuple())
    return SalesPrognosisResultsExport(response=response, status=status)


def pipeline_sales(
    session: Session,
    company_id: int | None = None,
    start_date: Datetime | None = None,
) -> SalesPrognosisResultsExport:
    response, status = get_sales_prognosis_data(
        session,
        company_id=company_id,
        start_date=start_date,
    )
    if status != STATUS_HANDLER.SUCCESS:
        return _export_on_fail(status)
    pipe = _preprocess_sales(
        response,
        feature_map=COL_MAP_SALES_PROGNOSIS,
        target_features=FEATURES_SALES_PROGNOSIS,
    )
    if pipe.status != STATUS_HANDLER.SUCCESS:
        return _export_on_fail(pipe.status)
    pipe = _process_sales(
        pipe,
        min_num_data_points=MIN_NUMBER_DATAPOINTS,
    )
    if pipe.status != STATUS_HANDLER.SUCCESS:
        return _export_on_fail(pipe.status)
    pipe = _postprocess_sales(
        pipe,
        feature_map=DualDict(),
    )
    if pipe.status != STATUS_HANDLER.SUCCESS:
        return _export_on_fail(pipe.status)
    assert pipe.results is not None, "needed export response not set in pipeline"
    return pipe.results
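

# A minimal call sketch, assuming an authenticated `Session` from
# delta_barth.api.common; `company_id` and `start_date` are optional filters
# forwarded to the data request:
#
#     export = pipeline_sales(session, company_id=7, start_date=Datetime(2023, 1, 1))
#     if export.status == STATUS_HANDLER.SUCCESS:
#         forecast_records = export.response.daten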


def pipeline_sales_dummy(
    session: Session,
    company_id: int | None = None,
    start_date: Datetime | None = None,
) -> SalesPrognosisResultsExport:  # pragma: no cover
    """Prototype dummy function for tests by DelBar."""
    ...