idea of timedelta-based algorithm
This commit is contained in:
parent
5d1f5199d3
commit
e1b375396a
@ -8,6 +8,7 @@ from dataclasses import asdict
|
|||||||
from datetime import datetime as Datetime
|
from datetime import datetime as Datetime
|
||||||
from typing import TYPE_CHECKING, Final, TypeAlias, cast
|
from typing import TYPE_CHECKING, Final, TypeAlias, cast
|
||||||
|
|
||||||
|
import dopt_basics.datetime
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import scipy.stats
|
import scipy.stats
|
||||||
@ -15,6 +16,7 @@ import sqlalchemy as sql
|
|||||||
|
|
||||||
# --- new: for calculating timedelta
|
# --- new: for calculating timedelta
|
||||||
from dateutil.relativedelta import relativedelta
|
from dateutil.relativedelta import relativedelta
|
||||||
|
from dopt_basics.datetime import TimeUnitsTimedelta
|
||||||
from sklearn.metrics import mean_absolute_error, r2_score
|
from sklearn.metrics import mean_absolute_error, r2_score
|
||||||
from sklearn.model_selection import KFold, RandomizedSearchCV
|
from sklearn.model_selection import KFold, RandomizedSearchCV
|
||||||
from xgboost import XGBRegressor
|
from xgboost import XGBRegressor
|
||||||
@ -251,26 +253,57 @@ def _process_sales(
|
|||||||
# TODO: therefore, stepping with fixed value n does not result in timedelta of n episodes
|
# TODO: therefore, stepping with fixed value n does not result in timedelta of n episodes
|
||||||
# Option A: pad data frame with zero values --> could impede forecast algorithm
|
# Option A: pad data frame with zero values --> could impede forecast algorithm
|
||||||
# Option B: calculate next index based on timedelta
|
# Option B: calculate next index based on timedelta
|
||||||
dates = monthly_sum.index
|
stride = dopt_basics.datetime.timedelta_from_val(365, TimeUnitsTimedelta.DAYS)
|
||||||
|
|
||||||
|
dates = cast(pd.DatetimeIndex, monthly_sum.index)
|
||||||
|
|
||||||
|
min_date = dates.min()
|
||||||
|
|
||||||
# print("dates: ", dates)
|
# print("dates: ", dates)
|
||||||
# ?? --- new: use monthly basis for time windows
|
# ?? --- new: use monthly basis for time windows
|
||||||
# baseline: 3 years - 36 months
|
# baseline: 3 years - 36 months
|
||||||
starting_date = datetime.datetime.now() - relativedelta(months=12)
|
starting_date = datetime.datetime.now() - relativedelta(months=12)
|
||||||
# starting_date = dates.max() - relativedelta(months=36)
|
# starting_date = dates.max() - relativedelta(months=36)
|
||||||
start_index = next(
|
# start_index = next(
|
||||||
(i for i, date in enumerate(dates) if date >= starting_date), len(dates) - 1
|
# (i for i, date in enumerate(dates) if date >= starting_date), len(dates) - 1
|
||||||
)
|
# )
|
||||||
print("start idx: ", start_index, "length dates: ", len(dates))
|
# print("start idx: ", start_index, "length dates: ", len(dates))
|
||||||
|
|
||||||
for add_year, date_idx in enumerate(range(start_index, -1, -12)):
|
def get_index_date(
|
||||||
print("date_idx: ", date_idx)
|
dates: pd.DatetimeIndex,
|
||||||
first_date = dates[date_idx]
|
starting_date: datetime.datetime | pd.Timestamp,
|
||||||
print("first date: ", first_date)
|
) -> tuple[pd.Timestamp, bool]:
|
||||||
|
target, succ = next(
|
||||||
|
((date, True) for date in dates if date >= starting_date), (dates[-1], False)
|
||||||
|
)
|
||||||
|
return target, succ
|
||||||
|
|
||||||
|
first_date, succ = get_index_date(dates, starting_date)
|
||||||
|
if not succ:
|
||||||
|
# !! return early
|
||||||
|
...
|
||||||
|
|
||||||
|
date_span = first_date - min_date
|
||||||
|
steps = date_span.days // stride.days
|
||||||
|
|
||||||
|
for step in range(steps + 1):
|
||||||
|
print("step: ", step)
|
||||||
|
target_date = first_date - step * stride
|
||||||
|
print("target date: ", target_date)
|
||||||
split_date = dates[-6]
|
split_date = dates[-6]
|
||||||
|
|
||||||
|
index_date, succ = get_index_date(dates, target_date)
|
||||||
|
|
||||||
|
if not succ:
|
||||||
|
break
|
||||||
|
|
||||||
|
if index_date >= split_date:
|
||||||
|
print("Skip because of date difference")
|
||||||
|
continue
|
||||||
|
|
||||||
train = cast(
|
train = cast(
|
||||||
pd.DataFrame,
|
pd.DataFrame,
|
||||||
monthly_sum.loc[first_date:split_date].copy(), # type: ignore
|
monthly_sum.loc[index_date:split_date].copy(), # type: ignore
|
||||||
)
|
)
|
||||||
print(train)
|
print(train)
|
||||||
print("Length train: ", len(train))
|
print("Length train: ", len(train))
|
||||||
@ -284,7 +317,7 @@ def _process_sales(
|
|||||||
# ?? --- new: adapted condition to fit new for-loop
|
# ?? --- new: adapted condition to fit new for-loop
|
||||||
# test set size fixed at 6 --> first iteration: baseline - 6 entries
|
# test set size fixed at 6 --> first iteration: baseline - 6 entries
|
||||||
# for each new year 10 new data points needed
|
# for each new year 10 new data points needed
|
||||||
if len(train) >= 30 + 10 * add_year:
|
if len(train) >= 30 + 10 * step:
|
||||||
too_few_month_points = False
|
too_few_month_points = False
|
||||||
|
|
||||||
rand = RandomizedSearchCV(
|
rand = RandomizedSearchCV(
|
||||||
@ -308,7 +341,7 @@ def _process_sales(
|
|||||||
best_score_mae = error
|
best_score_mae = error
|
||||||
best_score_r2 = cast(float, r2_score(y_test, y_pred))
|
best_score_r2 = cast(float, r2_score(y_test, y_pred))
|
||||||
# --- new: use first_date for best_start_year
|
# --- new: use first_date for best_start_year
|
||||||
best_start_year = first_date.year
|
best_start_year = target_date.year
|
||||||
# --- new: store best_estimator
|
# --- new: store best_estimator
|
||||||
best_estimator = copy.copy(rand.best_estimator_)
|
best_estimator = copy.copy(rand.best_estimator_)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user