Add OrderHistory.make_horizontal_time_series()
- the method slices out a horizontal time series from the data within an `OrderHistory` object
This commit is contained in:
parent
65d1632e98
commit
b61db734b6
3 changed files with 258 additions and 0 deletions
|
@ -1,6 +1,7 @@
|
|||
"""Obtain and work with time series data."""
|
||||
|
||||
import datetime as dt
|
||||
from typing import Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
@ -145,3 +146,69 @@ class OrderHistory:
|
|||
index.names = ['pixel_id', 'start_at']
|
||||
|
||||
return data.reindex(index, fill_value=0)
|
||||
|
||||
def make_horizontal_time_series(  # noqa:WPS210
    self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
    """Slice a horizontal time series out of the `.totals`.

    Create a time series covering `train_horizon` weeks that can be used
    for training a forecasting model to predict the demand at `predict_at`.

    For explanation of the terms "horizontal", "vertical", and "real-time"
    in the context of time series, see section 3.2 in the following paper:
    https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf

    Args:
        pixel_id: pixel in which the time series is aggregated
        predict_at: time step (i.e., "start_at") for which a prediction is made;
            only its date, hour, and minute are used to build the training slice
        train_horizon: weeks of historic data used to predict `predict_at`

    Returns:
        training time series, frequency, actual order count at `predict_at`;
        the actual order count is a one-row slice of the pixel's totals
        (selected with a list indexer), not a plain integer

    Raises:
        LookupError: `pixel_id` is not in the `grid`
        RuntimeError: desired time series slice is not entirely in `.totals`
    """
    try:
        intra_pixel = self.totals.loc[pixel_id]
    except KeyError:
        raise LookupError('The `pixel_id` is not in the `grid`') from None

    if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
        raise RuntimeError('Internal error: cannot predict beyond the given data')

    # The first and last training day are just before the `predict_at` day
    # and span exactly `train_horizon` weeks covering only the times of the
    # day equal to the hour/minute of `predict_at`.
    predict_day = predict_at.date()
    time_of_day = dt.time(predict_at.hour, predict_at.minute)
    first_start_at = dt.datetime.combine(
        predict_day - dt.timedelta(weeks=train_horizon), time_of_day,
    )
    last_start_at = dt.datetime.combine(
        predict_day - dt.timedelta(days=1), time_of_day,
    )

    # The frequency is the number of weekdays.
    frequency = 7

    # Take only the counts at the `predict_at` time: stepping through the
    # label slice by the number of daily time steps keeps one row per day.
    training_df = intra_pixel.loc[
        first_start_at : last_start_at : self._n_daily_time_steps  # type: ignore
    ]
    # One row per day is expected; fewer rows mean the slice reaches
    # outside of the available totals.
    if len(training_df) != frequency * train_horizon:
        raise RuntimeError('Not enough historic data for `predict_at`')

    # The list indexer keeps the result two-dimensional (one row).
    # NOTE(review): a `predict_at` that is not a label in the index is not
    # translated into a `RuntimeError` here -- confirm if that is intended.
    actual_df = intra_pixel.loc[[predict_at]]

    return training_df, frequency, actual_df
|
||||
|
|
|
@ -1,10 +1,28 @@
|
|||
"""Globals used when testing."""
|
||||
|
||||
import datetime
|
||||
|
||||
from urban_meal_delivery import config
|
||||
|
||||
|
||||
# Reference day around which most test cases are built.
YEAR, MONTH, DAY = 2016, 7, 1

# `START` (first day, service start) and `END` (two weeks later, service end)
# together cover 15 calendar days. Since a `train_horizon` is counted in
# full 7-day weeks, the largest usable value is `2`.
START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
_end_day = START.date() + datetime.timedelta(weeks=2)
END = datetime.datetime.combine(_end_day, datetime.time(config.SERVICE_END, 0))

# Time step lengths used by default, e.g., when creating `OrderHistory` objects.
SHORT_TIME_STEP = 30
LONG_TIME_STEP = 60
TIME_STEPS = (SHORT_TIME_STEP, LONG_TIME_STEP)

# Training horizons (in weeks) used by default, e.g., when calling
# `OrderHistory.make_horizontal_time_series()`.
SHORT_TRAIN_HORIZON = 1
LONG_TRAIN_HORIZON = 2
TRAIN_HORIZONS = (SHORT_TRAIN_HORIZON, LONG_TRAIN_HORIZON)
|
||||
|
|
173
tests/forecasts/timify/test_make_time_series.py
Normal file
173
tests/forecasts/timify/test_make_time_series.py
Normal file
|
@ -0,0 +1,173 @@
|
|||
"""Test the time series related code."""
|
||||
# pylint:disable=no-self-use,unused-argument
|
||||
|
||||
import datetime
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from tests import config as test_config
|
||||
from urban_meal_delivery import config
|
||||
from urban_meal_delivery.forecasts import timify
|
||||
|
||||
|
||||
@pytest.fixture
def good_pixel_id():
    """Provide the ID of a pixel that exists on the `grid`."""
    pixel_id = 1
    return pixel_id
|
||||
|
||||
|
||||
@pytest.fixture
def order_totals(good_pixel_id):
    """A mock for `OrderHistory.totals`.

    To be a bit more realistic, we sample two pixels on the `grid`.

    The frame has a `('pixel_id', 'start_at')` index with one row per pixel
    and time step between `START` and `END` that lies within the service
    hours; every `total_orders` value is `0`.
    """
    pixel_ids = [good_pixel_id, good_pixel_id + 1]

    # Use the 'min' alias for minutes: the equivalent 'T' alias is
    # deprecated in recent pandas versions.
    gen = (
        (pixel_id, start_at)
        for pixel_id in pixel_ids
        for start_at in pd.date_range(
            test_config.START, test_config.END, freq=f'{test_config.LONG_TIME_STEP}min',
        )
        if config.SERVICE_START <= start_at.hour < config.SERVICE_END
    )

    # Build the two-level index directly from the (pixel, time step) pairs.
    index = pd.MultiIndex.from_tuples(gen)
    index.names = ['pixel_id', 'start_at']

    df = pd.DataFrame(data={'total_orders': 0}, index=index)

    # Sanity check: n_pixels * n_time_steps_per_day * n_days, where the
    # `START`-`END` span covers two full weeks plus one day.
    # The `12` assumes twelve hourly time steps per service day.
    assert len(df) == 2 * 12 * (7 * 2 + 1)

    return df
|
||||
|
||||
|
||||
@pytest.fixture
def order_history(order_totals, grid):
    """Provide an `OrderHistory` that works without a database.

    The mocked `order_totals` are injected into the private `._data`
    attribute of the freshly created object.
    """
    history = timify.OrderHistory(
        grid=grid, time_step=test_config.LONG_TIME_STEP,
    )
    history._data = order_totals  # pylint:disable=protected-access
    return history
|
||||
|
||||
|
||||
@pytest.fixture
def good_predict_at():
    """A `predict_at` inside the `START`-`END` span ...

    ... that is preceded by enough history for both `train_horizon=1`
    and `train_horizon=2`: noon on the day of `END`.
    """
    noon = datetime.time(12, 0)
    return datetime.datetime.combine(test_config.END.date(), noon)
|
||||
|
||||
|
||||
@pytest.fixture
def bad_predict_at():
    """A `predict_at` inside the `START`-`END` span ...

    ... that is too early for any training horizon: neither
    `train_horizon=1` nor `train_horizon=2` has enough history.
    It is noon on the day eight days before `END`.
    """
    too_early = test_config.END - datetime.timedelta(weeks=1, days=1)
    noon = datetime.time(12, 0)
    return datetime.datetime.combine(too_early.date(), noon)
|
||||
|
||||
|
||||
class TestMakeHorizontalTimeSeries:
    """Test the `OrderHistory.make_horizontal_time_series()` method."""

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
        """A `pixel_id` that is not in the `grid`."""
        with pytest.raises(LookupError):
            order_history.make_horizontal_time_series(
                pixel_id=999_999,
                predict_at=good_predict_at,
                train_horizon=train_horizon,
            )

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_are_dataframes(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The time series come in a one-column `pd.DataFrame`."""
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        # Compare plain lists: `Index == list` yields an array whose truth
        # value is only defined for a single element, so a frame with extra
        # columns would error out instead of failing the assertion cleanly.
        assert isinstance(training_df, pd.DataFrame)
        assert list(training_df.columns) == ['total_orders']
        assert isinstance(actual_df, pd.DataFrame)
        assert list(actual_df.columns) == ['total_orders']

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_have_correct_length(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The length of a training time series must be a multiple of `7` ...

        whereas the time series with the actual order counts always holds `1` value.
        """
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        assert len(training_df) == 7 * train_horizon
        assert len(actual_df) == 1

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_frequency_is_number_of_weekdays(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The `frequency` must be `7`."""
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        _, frequency, _ = result  # noqa:WPS434

        assert frequency == 7

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_no_long_enough_history1(
        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
    ):
        """If the `predict_at` day is too early in the `START`-`END` horizon ...

        ... the history of order totals is not long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_horizontal_time_series(
                pixel_id=good_pixel_id,
                predict_at=bad_predict_at,
                train_horizon=train_horizon,
            )

    def test_no_long_enough_history2(
        self, order_history, good_pixel_id, good_predict_at,
    ):
        """If the `train_horizon` is longer than the `START`-`END` horizon ...

        ... the history of order totals can never be long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_horizontal_time_series(
                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
            )
|
Loading…
Reference in a new issue