Add OrderHistory.make_real_time_time_series()
- the method slices out a real-time time series from the data within an `OrderHistory` object
This commit is contained in:
parent
5330ceb771
commit
100fac659a
3 changed files with 257 additions and 3 deletions
|
@ -294,3 +294,93 @@ class OrderHistory:
|
||||||
]
|
]
|
||||||
|
|
||||||
return training_df, frequency, actuals_df
|
return training_df, frequency, actuals_df
|
||||||
|
|
||||||
|
def make_real_time_time_series(  # noqa:WPS210
    self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
    """Slice a vertical real-time time series out of the `.totals`.

    Create a time series covering `train_horizon` weeks that can be used
    for training a forecasting model to predict the demand at `predict_at`.

    For explanation of the terms "horizontal", "vertical", and "real-time"
    in the context of time series, see section 3.2 in the following paper:
    https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf

    Args:
        pixel_id: pixel in which the time series is aggregated
        predict_at: time step (i.e., "start_at") for which a prediction is made
        train_horizon: weeks of historic data used to predict `predict_at`

    Returns:
        training time series, frequency, actual order count at `predict_at`;
        the training time series and the actual order count are both
        slices of `.totals` for the given `pixel_id`, the latter holding
        exactly one row

    Raises:
        LookupError: `pixel_id` is not in the `grid`
        RuntimeError: desired time series slice is not entirely in `.totals`
    """
    try:
        intra_pixel = self.totals.loc[pixel_id]
    except KeyError:
        raise LookupError('The `pixel_id` is not in the `grid`') from None

    if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
        raise RuntimeError('Internal error: cannot predict beyond the given data')

    # The first and last training day are just before the `predict_at` day
    # and span exactly `train_horizon` weeks covering all times of the day,
    # including times on the `predict_at` day that are earlier than `predict_at`.
    first_train_day = predict_at.date() - dt.timedelta(weeks=train_horizon)
    first_start_at = dt.datetime(
        first_train_day.year,
        first_train_day.month,
        first_train_day.day,
        config.SERVICE_START,
        0,
    )

    # Predicting the first time step on the `predict_at` day is a corner case.
    # Then, the previous day is indeed the `last_train_day`. Predicting any
    # other time step implies that the `predict_at` day is the `last_train_day`.
    # `last_train_time` is the last "start_at" before the one being predicted.
    #
    # The minute must be checked as well: with time steps shorter than one
    # hour, several time steps share the hour `config.SERVICE_START`, but
    # only the one at minute `0` is the first time step of the day.
    is_first_time_step = (
        predict_at.hour == config.SERVICE_START and predict_at.minute == 0
    )
    if is_first_time_step:
        last_train_day = predict_at.date() - dt.timedelta(days=1)
        last_train_time = dt.time(config.SERVICE_END, 0)
    else:
        last_train_day = predict_at.date()
        last_train_time = predict_at.time()
    last_start_at = dt.datetime(
        last_train_day.year,
        last_train_day.month,
        last_train_day.day,
        last_train_time.hour,
        last_train_time.minute,
    ) - dt.timedelta(minutes=self._time_step)

    # The frequency is the number of weekdays times the number of daily time steps.
    frequency = 7 * self._n_daily_time_steps

    # Take all the counts between `first_train_day` and `last_train_day`,
    # including the ones on the `predict_at` day prior to `predict_at`.
    # Label-based slicing with `.loc` includes both end points.
    training_df = intra_pixel.loc[
        first_start_at:last_start_at  # type: ignore
    ]

    # Number of time steps on the `predict_at` day that lie before `predict_at`.
    service_start_on_predict_day = dt.datetime(
        predict_at.year,
        predict_at.month,
        predict_at.day,
        config.SERVICE_START,
        0,
    )
    n_time_steps_on_predict_day = (
        (predict_at - service_start_on_predict_day).seconds
        // 60  # -> minutes
        // self._time_step
    )

    # A shorter (or longer) slice means `.totals` does not cover
    # the entire training horizon.
    if len(training_df) != frequency * train_horizon + n_time_steps_on_predict_day:
        raise RuntimeError('Not enough historic data for `predict_at`')

    # The double brackets keep the result a data frame with one row.
    actual_df = intra_pixel.loc[[predict_at]]

    return training_df, frequency, actual_df
|
||||||
|
|
|
@ -8,6 +8,9 @@ from urban_meal_delivery import config
|
||||||
# The day on which most test cases take place.
|
# The day on which most test cases take place.
|
||||||
YEAR, MONTH, DAY = 2016, 7, 1
|
YEAR, MONTH, DAY = 2016, 7, 1
|
||||||
|
|
||||||
|
# The hour when most test cases take place.
|
||||||
|
NOON = 12
|
||||||
|
|
||||||
# `START` and `END` constitute a 15-day time span.
|
# `START` and `END` constitute a 15-day time span.
|
||||||
# That implies a maximum `train_horizon` of `2` as that needs full 7-day weeks.
|
# That implies a maximum `train_horizon` of `2` as that needs full 7-day weeks.
|
||||||
START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
|
START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
"""Test the time series related code."""
|
"""Test the code generating time series with the order totals.
|
||||||
|
|
||||||
|
Unless otherwise noted, each `time_step` is 60 minutes long implying
|
||||||
|
12 time steps per day (i.e., we use `LONG_TIME_STEP` by default).
|
||||||
|
"""
|
||||||
# pylint:disable=no-self-use,unused-argument
|
# pylint:disable=no-self-use,unused-argument
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -63,7 +67,11 @@ def good_predict_at():
|
||||||
or `train_horizon=2` works.
|
or `train_horizon=2` works.
|
||||||
"""
|
"""
|
||||||
return datetime.datetime(
|
return datetime.datetime(
|
||||||
test_config.END.year, test_config.END.month, test_config.END.day, 12, 0,
|
test_config.END.year,
|
||||||
|
test_config.END.month,
|
||||||
|
test_config.END.day,
|
||||||
|
test_config.NOON,
|
||||||
|
0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -76,7 +84,7 @@ def bad_predict_at():
|
||||||
"""
|
"""
|
||||||
predict_day = test_config.END - datetime.timedelta(weeks=1, days=1)
|
predict_day = test_config.END - datetime.timedelta(weeks=1, days=1)
|
||||||
return datetime.datetime(
|
return datetime.datetime(
|
||||||
predict_day.year, predict_day.month, predict_day.day, 12, 0,
|
predict_day.year, predict_day.month, predict_day.day, test_config.NOON, 0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -282,3 +290,156 @@ class TestMakeVerticalTimeSeries:
|
||||||
predict_day=good_predict_at.date(),
|
predict_day=good_predict_at.date(),
|
||||||
train_horizon=999,
|
train_horizon=999,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMakeRealTimeTimeSeries:
    """Test the `OrderHistory.make_real_time_time_series()` method."""

    @staticmethod
    def _n_daily_time_steps():
        """Number of `LONG_TIME_STEP`s between `SERVICE_START` and `SERVICE_END`."""
        return (
            60
            * (config.SERVICE_END - config.SERVICE_START)
            // test_config.LONG_TIME_STEP
        )

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
        """A `pixel_id` that is not in the `grid`."""
        with pytest.raises(LookupError):
            order_history.make_real_time_time_series(
                pixel_id=999_999,
                predict_at=good_predict_at,
                train_horizon=train_horizon,
            )

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_are_dataframes(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The time series come in a one-column `pd.DataFrame`."""
        result = order_history.make_real_time_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        # Compare plain lists: `Index == list` is evaluated element-wise
        # and raises a `ValueError` if the number of columns differs.
        assert isinstance(training_df, pd.DataFrame)
        assert list(training_df.columns) == ['total_orders']
        assert isinstance(actual_df, pd.DataFrame)
        assert list(actual_df.columns) == ['total_orders']

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_have_correct_length1(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The length of a training time series is the product of the ...

        ... weekly time steps (i.e., product of `7` and the number of daily time steps)
        and the `train_horizon` in weeks; however, this assertion only holds if
        we predict the first `time_step` of the day.

        The time series with the actual order counts always holds `1` value.
        """
        # Move the prediction to the first time step on the same day.
        predict_at = datetime.datetime(
            good_predict_at.year,
            good_predict_at.month,
            good_predict_at.day,
            config.SERVICE_START,
            0,
        )
        result = order_history.make_real_time_time_series(
            pixel_id=good_pixel_id, predict_at=predict_at, train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        assert len(training_df) == 7 * self._n_daily_time_steps() * train_horizon
        assert len(actual_df) == 1

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_have_correct_length2(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The length of a training time series is the product of the ...

        ... weekly time steps (i.e., product of `7` and the number of daily time steps)
        and the `train_horizon` in weeks; however, this assertion only holds if
        we predict the first `time_step` of the day. Predicting any other `time_step`
        means that the training time series becomes longer by the number of time steps
        before the one being predicted.

        The time series with the actual order counts always holds `1` value.
        """
        # The expected length below is derived from `NOON`.
        assert good_predict_at.hour == test_config.NOON

        result = order_history.make_real_time_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        n_time_steps_before = (
            60 * (test_config.NOON - config.SERVICE_START) // test_config.LONG_TIME_STEP
        )

        assert (
            len(training_df)
            == 7 * self._n_daily_time_steps() * train_horizon + n_time_steps_before
        )
        assert len(actual_df) == 1

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_frequency_is_number_number_of_weekly_time_steps(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The `frequency` is the number of weekly time steps."""
        result = order_history.make_real_time_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        _, frequency, _ = result  # noqa:WPS434

        assert frequency == 7 * self._n_daily_time_steps()

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_no_long_enough_history1(
        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
    ):
        """If the `predict_at` day is too early in the `START`-`END` horizon ...

        ... the history of order totals is not long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_real_time_time_series(
                pixel_id=good_pixel_id,
                predict_at=bad_predict_at,
                train_horizon=train_horizon,
            )

    def test_no_long_enough_history2(
        self, order_history, good_pixel_id, good_predict_at,
    ):
        """If the `train_horizon` is longer than the `START`-`END` horizon ...

        ... the history of order totals can never be long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_real_time_time_series(
                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
            )
|
||||||
|
|
Loading…
Reference in a new issue