Add OrderHistory.make_horizontal_time_series()

- the method slices out a horizontal time series from the data within
  an `OrderHistory` object
This commit is contained in:
Alexander Hess 2021-01-09 16:34:42 +01:00
parent 65d1632e98
commit b61db734b6
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
3 changed files with 258 additions and 0 deletions

View file

@ -1,6 +1,7 @@
"""Obtain and work with time series data.""" """Obtain and work with time series data."""
import datetime as dt import datetime as dt
from typing import Tuple
import pandas as pd import pandas as pd
@ -145,3 +146,69 @@ class OrderHistory:
index.names = ['pixel_id', 'start_at'] index.names = ['pixel_id', 'start_at']
return data.reindex(index, fill_value=0) return data.reindex(index, fill_value=0)
def make_horizontal_time_series(  # noqa:WPS210
    self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
    """Slice a horizontal time series out of the `.totals`.

    Create a time series covering `train_horizon` weeks that can be used
    for training a forecasting model to predict the demand at `predict_at`.

    For explanation of the terms "horizontal", "vertical", and "real-time"
    in the context of time series, see section 3.2 in the following paper:
    https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf

    Args:
        pixel_id: pixel in which the time series is aggregated
        predict_at: time step (i.e., "start_at") for which a prediction is made
        train_horizon: weeks of historic data used to predict `predict_at`

    Returns:
        training time series, frequency, actual order count at `predict_at`;
        the first and the last element are slices of `.totals` for the
        given pixel, the last one holding only the row at `predict_at`

    Raises:
        LookupError: `pixel_id` is not in the `grid`
        RuntimeError: desired time series slice is not entirely in `.totals`
    """
    try:
        intra_pixel = self.totals.loc[pixel_id]
    except KeyError:
        raise LookupError('The `pixel_id` is not in the `grid`') from None

    if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
        raise RuntimeError('Internal error: cannot predict beyond the given data')

    # The first and last training day are just before the `predict_at` day
    # and span exactly `train_horizon` weeks covering only the times of the
    # day equal to the hour/minute of `predict_at`.
    first_train_day = predict_at.date() - dt.timedelta(weeks=train_horizon)
    first_start_at = dt.datetime(
        first_train_day.year,
        first_train_day.month,
        first_train_day.day,
        predict_at.hour,
        predict_at.minute,
    )
    last_train_day = predict_at.date() - dt.timedelta(days=1)
    last_start_at = dt.datetime(
        last_train_day.year,
        last_train_day.month,
        last_train_day.day,
        predict_at.hour,
        predict_at.minute,
    )

    # The frequency is the number of weekdays.
    frequency = 7

    # Take only the counts at the `predict_at` time: the label-based slice
    # includes both end points and steps over `._n_daily_time_steps` rows
    # at a time (presumably one full day of time steps).
    training_df = intra_pixel.loc[
        first_start_at : last_start_at : self._n_daily_time_steps  # type: ignore
    ]
    # Exactly one observation per day is expected for every training day.
    if len(training_df) != frequency * train_horizon:
        raise RuntimeError('Not enough historic data for `predict_at`')

    # Indexing with a list keeps the result two-dimensional (one row).
    actual_df = intra_pixel.loc[[predict_at]]

    return training_df, frequency, actual_df

View file

@ -1,10 +1,28 @@
"""Globals used when testing.""" """Globals used when testing."""
import datetime
from urban_meal_delivery import config
# The day on which most test cases take place.
YEAR, MONTH, DAY = 2016, 7, 1

# `START` and `END` constitute a 15-day time span.
# That implies a maximum `train_horizon` of `2` as that needs full 7-day weeks.
START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
# `END` falls on the day two weeks after `START`, at the `SERVICE_END` hour.
_end_day = (START + datetime.timedelta(weeks=2)).date()
END = datetime.datetime(
    _end_day.year, _end_day.month, _end_day.day, config.SERVICE_END, 0,
)

# Default time steps, for example, for `OrderHistory` objects.
LONG_TIME_STEP = 60
SHORT_TIME_STEP = 30
TIME_STEPS = (SHORT_TIME_STEP, LONG_TIME_STEP)

# Default training horizons, for example, for
# `OrderHistory.make_horizontal_time_series()`.
LONG_TRAIN_HORIZON = 2
SHORT_TRAIN_HORIZON = 1
TRAIN_HORIZONS = (SHORT_TRAIN_HORIZON, LONG_TRAIN_HORIZON)

View file

@ -0,0 +1,173 @@
"""Test the time series related code."""
# pylint:disable=no-self-use,unused-argument
import datetime
import pandas as pd
import pytest
from tests import config as test_config
from urban_meal_delivery import config
from urban_meal_delivery.forecasts import timify
@pytest.fixture
def good_pixel_id():
    """Provide the ID of a pixel that exists on the `grid`."""
    pixel_id = 1
    return pixel_id
@pytest.fixture
def order_totals(good_pixel_id):
    """A mock for `OrderHistory.totals`.

    To be a bit more realistic, we sample two pixels on the `grid`.

    The returned `pd.DataFrame` has a `(pixel_id, start_at)` index and a
    single `total_orders` column that is `0` for every time step.
    """
    pixel_ids = [good_pixel_id, good_pixel_id + 1]

    # One `(pixel_id, start_at)` pair per pixel and time step between
    # `START` and `END`, keeping only time steps within the service hours.
    gen = (
        (pixel_id, start_at)
        for pixel_id in pixel_ids
        for start_at in pd.date_range(
            test_config.START,
            test_config.END,
            # `min` is the non-deprecated spelling of the minutely alias `T`.
            freq=f'{test_config.LONG_TIME_STEP}min',
        )
        if config.SERVICE_START <= start_at.hour < config.SERVICE_END
    )

    index = pd.MultiIndex.from_tuples(gen)
    index.names = ['pixel_id', 'start_at']

    # There is no demand at all in the mocked data.
    df = pd.DataFrame(data={'total_orders': 0}, index=index)

    # Sanity check: n_pixels * n_time_steps_per_day * n_days,
    # where n_days = n_weekdays * n_weeks + 1 (i.e., the 15-day time span).
    assert len(df) == 2 * 12 * (7 * 2 + 1)

    return df
@pytest.fixture
def order_history(order_totals, grid):
    """Build an `OrderHistory` that works without the database.

    The mocked order totals are assigned to the private `._data` attribute
    right after the object is created with the long time step.
    """
    history = timify.OrderHistory(
        grid=grid, time_step=test_config.LONG_TIME_STEP,
    )
    history._data = order_totals  # pylint:disable=protected-access

    return history
@pytest.fixture
def good_predict_at():
    """Provide a `predict_at` at noon on the day of `END`.

    That day is preceded by a history long enough for both
    `train_horizon=1` and `train_horizon=2` to work.
    """
    end = test_config.END

    return datetime.datetime(end.year, end.month, end.day, 12, 0)
@pytest.fixture
def bad_predict_at():
    """Provide a `predict_at` at noon, one week and one day before `END`.

    The day lies within `START`-`END` but is not preceded by a history
    long enough for either `train_horizon=1` or `train_horizon=2`.
    """
    # One week plus one day, expressed in days.
    early = test_config.END - datetime.timedelta(days=8)

    return datetime.datetime(early.year, early.month, early.day, 12, 0)
class TestMakeHorizontalTimeSeries:
    """Test the `OrderHistory.make_horizontal_time_series()` method."""

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
        """A `pixel_id` that is not in the `grid`."""
        with pytest.raises(LookupError):
            order_history.make_horizontal_time_series(
                pixel_id=999_999,
                predict_at=good_predict_at,
                train_horizon=train_horizon,
            )

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_are_dataframes(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The time series come in a one-column `pd.DataFrame`."""
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        # Compare plain lists: `Index == list` works element-wise and would
        # raise instead of failing the assertion if the lengths differed.
        assert isinstance(training_df, pd.DataFrame)
        assert list(training_df.columns) == ['total_orders']
        assert isinstance(actual_df, pd.DataFrame)
        assert list(actual_df.columns) == ['total_orders']

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_time_series_have_correct_length(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The length of a training time series must be a multiple of `7` ...

        whereas the time series with the actual order counts always holds `1` value.
        """
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        training_df, _, actual_df = result

        assert len(training_df) == 7 * train_horizon
        assert len(actual_df) == 1

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_frequency_is_number_of_weekdays(
        self, order_history, good_pixel_id, good_predict_at, train_horizon,
    ):
        """The `frequency` must be `7`."""
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
            predict_at=good_predict_at,
            train_horizon=train_horizon,
        )

        _, frequency, _ = result  # noqa:WPS434

        assert frequency == 7

    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
    def test_no_long_enough_history1(
        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
    ):
        """If the `predict_at` day is too early in the `START`-`END` horizon ...

        ... the history of order totals is not long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_horizontal_time_series(
                pixel_id=good_pixel_id,
                predict_at=bad_predict_at,
                train_horizon=train_horizon,
            )

    def test_no_long_enough_history2(
        self, order_history, good_pixel_id, good_predict_at,
    ):
        """If the `train_horizon` is longer than the `START`-`END` horizon ...

        ... the history of order totals can never be long enough.
        """
        with pytest.raises(RuntimeError):
            order_history.make_horizontal_time_series(
                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
            )