major overhaul of forecast pipeline #21

Merged
foefl merged 15 commits from prediction_to_future into main 2025-04-16 09:24:34 +00:00
3 changed files with 42 additions and 23 deletions
Showing only changes of commit afa31e2a94 - Show all commits

View File

@ -1,6 +1,6 @@
[project] [project]
name = "delta-barth" name = "delta-barth"
version = "0.5.7dev1" version = "0.5.7dev2"
description = "workflows and pipelines for the Python-based Plugin of Delta Barth's ERP system" description = "workflows and pipelines for the Python-based Plugin of Delta Barth's ERP system"
authors = [ authors = [
{name = "Florian Förster", email = "f.foerster@d-opt.com"}, {name = "Florian Förster", email = "f.foerster@d-opt.com"},
@ -44,7 +44,8 @@ filterwarnings = [
] ]
markers = [ markers = [
"api_con_required: tests require an API connection (deselect with '-m \"not api_con_required\"')", "api_con_required: tests require an API connection (deselect with '-m \"not api_con_required\"')",
"new: to test only new tests, usually removed afterwards (deselect with '-m \"not quick\"')", "new: to test only new tests, usually removed afterwards (deselect with '-m \"not new\"')",
"forecast: main components of forecast pipeline (deselect with '-m \"not forecast\"')"
] ]
log_cli = true log_cli = true
@ -73,7 +74,7 @@ directory = "reports/coverage"
[tool.bumpversion] [tool.bumpversion]
current_version = "0.5.7dev1" current_version = "0.5.7dev2"
parse = """(?x) parse = """(?x)
(?P<major>0|[1-9]\\d*)\\. (?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\. (?P<minor>0|[1-9]\\d*)\\.

View File

@ -208,17 +208,25 @@ def _process_sales(
df_cust["jahr"] = df_cust[DATE_FEAT].dt.year df_cust["jahr"] = df_cust[DATE_FEAT].dt.year
df_cust["monat"] = df_cust[DATE_FEAT].dt.month df_cust["monat"] = df_cust[DATE_FEAT].dt.month
current_year = datetime.now().year monthly_sum_data_only = df_cust.groupby(["jahr", "monat"])[SALES_FEAT].sum().reset_index()
current_month = datetime.now().month
current_year = datetime.datetime.now().year
current_month = datetime.datetime.now().month
years = range(df_cust["jahr"].min(), current_year + 1) years = range(df_cust["jahr"].min(), current_year + 1)
old_monthly_sum = df_cust.groupby(["jahr", "monat"])[SALES_FEAT].sum().reset_index()
all_month_year_combinations = pd.DataFrame( all_month_year_combinations = pd.DataFrame(
[(year, month) for year in years for month in range(1, 13) if (year < current_year or (year == current_year and month <= current_month))], columns=["jahr", "monat"] [
(year, month)
for year in years
for month in range(1, 13)
if (year < current_year or (year == current_year and month <= current_month))
],
columns=["jahr", "monat"],
) )
monthly_sum = pd.merge(all_month_year_combinations, old_monthly_sum, on=["jahr", "monat"], how="left") monthly_sum = pd.merge(
all_month_year_combinations, monthly_sum_data_only, on=["jahr", "monat"], how="left"
)
monthly_sum[SALES_FEAT] = monthly_sum[SALES_FEAT].fillna(0) monthly_sum[SALES_FEAT] = monthly_sum[SALES_FEAT].fillna(0)
monthly_sum[DATE_FEAT] = ( monthly_sum[DATE_FEAT] = (
monthly_sum["monat"].astype(str) + "." + monthly_sum["jahr"].astype(str) monthly_sum["monat"].astype(str) + "." + monthly_sum["jahr"].astype(str)
@ -256,27 +264,22 @@ def _process_sales(
too_few_month_points: bool = True too_few_month_points: bool = True
dates = cast(pd.DatetimeIndex, monthly_sum.index) dates = cast(pd.DatetimeIndex, monthly_sum.index)
# print("dates: ", dates)
# baseline: 3 years - 36 months # baseline: 3 years - 36 months
starting_date = datetime.datetime.now() - relativedelta(months=36) starting_date = datetime.datetime.now() - relativedelta(months=36)
target_index, succ = next( target_index, _ = next(
((i, True) for i, date in enumerate(dates) if date >= starting_date), (len(dates) - 1, False) ((i, True) for i, date in enumerate(dates) if date >= starting_date),
(len(dates) - 1, False),
) )
# print("start idx: ", target_index, "length dates: ", len(dates))
for add_year, date_idx in enumerate(range(start_index, -1, -12)): for add_year, date_idx in enumerate(range(target_index, -1, -12)):
# print("date_idx: ", date_idx)
first_date = dates[date_idx] first_date = dates[date_idx]
# print("first date: ", first_date)
split_date = dates[-6] split_date = dates[-6]
train = cast( train = cast(
pd.DataFrame, pd.DataFrame,
monthly_sum.loc[first_date:split_date].copy(), # type: ignore monthly_sum.loc[first_date:split_date].copy(), # type: ignore
) )
# print(train)
# print("Length train: ", len(train))
test = cast( test = cast(
pd.DataFrame, pd.DataFrame,
monthly_sum.loc[split_date:].copy(), # type: ignore monthly_sum.loc[split_date:].copy(), # type: ignore
@ -286,7 +289,7 @@ def _process_sales(
# test set size fixed at 6 --> first iteration: baseline - 6 entries # test set size fixed at 6 --> first iteration: baseline - 6 entries
# for each new year 10 new data points (i.e., sales strictly positive) needed # for each new year 10 new data points (i.e., sales strictly positive) needed
if len(train[train[SALES_FEAT] > 0]) >= 30 + 10 * add_year: if len(train[train[SALES_FEAT] > 0]) >= (base_num_data_points_months + 10 * add_year):
too_few_month_points = False too_few_month_points = False
rand = RandomizedSearchCV( rand = RandomizedSearchCV(
@ -314,7 +317,6 @@ def _process_sales(
# --- new: store best_estimator # --- new: store best_estimator
best_estimator = copy.copy(rand.best_estimator_) best_estimator = copy.copy(rand.best_estimator_)
# ?? --- new: use best_estimator to calculate future values and store them in forecast
if best_estimator is not None: if best_estimator is not None:
X_future = pd.DataFrame( X_future = pd.DataFrame(
{"jahr": future_dates.year, "monat": future_dates.month}, index=future_dates {"jahr": future_dates.year, "monat": future_dates.month}, index=future_dates

View File

@ -1,4 +1,6 @@
import datetime
from datetime import datetime as Datetime from datetime import datetime as Datetime
from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
import numpy as np import numpy as np
@ -255,6 +257,7 @@ def test_preprocess_sales_FailOnTargetFeature(
assert pipe.results is None assert pipe.results is None
@pytest.mark.forecast
def test_process_sales_Success(sales_data_real_preproc): def test_process_sales_Success(sales_data_real_preproc):
data = sales_data_real_preproc.copy() data = sales_data_real_preproc.copy()
pipe = PipeResult(data, STATUS_HANDLER.SUCCESS) pipe = PipeResult(data, STATUS_HANDLER.SUCCESS)
@ -277,6 +280,7 @@ def test_process_sales_Success(sales_data_real_preproc):
assert pipe.statistics.xgb_params is not None assert pipe.statistics.xgb_params is not None
@pytest.mark.forecast
def test_process_sales_FailTooFewPoints(sales_data_real_preproc): def test_process_sales_FailTooFewPoints(sales_data_real_preproc):
data = sales_data_real_preproc.copy() data = sales_data_real_preproc.copy()
data = data.iloc[:20, :] data = data.iloc[:20, :]
@ -303,6 +307,7 @@ def test_process_sales_FailTooFewPoints(sales_data_real_preproc):
assert pipe.statistics.xgb_params is None assert pipe.statistics.xgb_params is None
@pytest.mark.forecast
def test_process_sales_FailTooFewMonthPoints(sales_data_real_preproc): def test_process_sales_FailTooFewMonthPoints(sales_data_real_preproc):
data = sales_data_real_preproc.copy() data = sales_data_real_preproc.copy()
pipe = PipeResult(data, STATUS_HANDLER.SUCCESS) pipe = PipeResult(data, STATUS_HANDLER.SUCCESS)
@ -329,8 +334,19 @@ def test_process_sales_FailTooFewMonthPoints(sales_data_real_preproc):
assert pipe.statistics.xgb_params is None assert pipe.statistics.xgb_params is None
@pytest.mark.forecast
def test_process_sales_FailNoReliableForecast(sales_data_real_preproc): def test_process_sales_FailNoReliableForecast(sales_data_real_preproc):
data = sales_data_real_preproc.copy() # prepare fake data
df = sales_data_real_preproc.copy()
f_dates = "buchungs_datum"
end = datetime.datetime.now()
start = df[f_dates].max()
fake_dates = pd.date_range(start, end, freq="MS")
fake_data = [(1234, 1014, 1024, 1000, 10, date) for date in fake_dates]
fake_df = pd.DataFrame(fake_data, columns=df.columns)
enhanced_df = pd.concat((df, fake_df), ignore_index=True)
data = enhanced_df.copy()
data["betrag"] = 10000 data["betrag"] = 10000
print(data["betrag"]) print(data["betrag"])
data = data.iloc[:20000, :] data = data.iloc[:20000, :]
@ -340,7 +356,7 @@ def test_process_sales_FailNoReliableForecast(sales_data_real_preproc):
def __init__(self, *args, **kwargs) -> None: def __init__(self, *args, **kwargs) -> None:
class Predictor: class Predictor:
def predict(self, *args, **kwargs): def predict(self, *args, **kwargs):
return np.array([1, 1, 1, 1]) return np.array([1, 1, 1, 1], dtype=np.float64)
self.best_estimator_ = Predictor() self.best_estimator_ = Predictor()
@ -354,7 +370,7 @@ def test_process_sales_FailNoReliableForecast(sales_data_real_preproc):
pipe = fc._process_sales( pipe = fc._process_sales(
pipe, pipe,
min_num_data_points=1, min_num_data_points=1,
base_num_data_points_months=-100, base_num_data_points_months=1,
) )
assert pipe.status != STATUS_HANDLER.SUCCESS assert pipe.status != STATUS_HANDLER.SUCCESS