Add OrderHistory.make_horizontal_time_series()

- the method slices out a horizontal time series from the data within
  an `OrderHistory` object
This commit is contained in:
Alexander Hess 2021-01-09 16:34:42 +01:00
commit b61db734b6
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
3 changed files with 258 additions and 0 deletions

View file

@ -1,6 +1,7 @@
"""Obtain and work with time series data."""
import datetime as dt
from typing import Tuple
import pandas as pd
@ -145,3 +146,69 @@ class OrderHistory:
index.names = ['pixel_id', 'start_at']
return data.reindex(index, fill_value=0)
def make_horizontal_time_series(  # noqa:WPS210
    self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
    """Slice a horizontal time series out of the `.totals`.

    Create a time series covering `train_horizon` weeks that can be used
    for training a forecasting model to predict the demand at `predict_at`.

    For explanation of the terms "horizontal", "vertical", and "real-time"
    in the context of time series, see section 3.2 in the following paper:
    https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf

    Args:
        pixel_id: pixel in which the time series is aggregated
        predict_at: time step (i.e., "start_at") for which a prediction is made
        train_horizon: weeks of historic data used to predict `predict_at`

    Returns:
        training time series, frequency, actual data at `predict_at`;
        the "actual data" is the slice of `.totals` selected with the
        list indexer `[predict_at]` (i.e., it is NOT a plain integer)

    Raises:
        LookupError: `pixel_id` is not in the `grid`
        RuntimeError: desired time series slice is not entirely in `.totals`
    """
    try:
        intra_pixel = self.totals.loc[pixel_id]
    except KeyError:
        raise LookupError('The `pixel_id` is not in the `grid`') from None

    if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
        raise RuntimeError('Internal error: cannot predict beyond the given data')

    # Only the hour and minute of `predict_at` are carried over to the
    # training window; seconds and smaller units are deliberately dropped.
    time_of_day = dt.time(predict_at.hour, predict_at.minute)

    # The first and last training day are just before the `predict_at` day
    # and span exactly `train_horizon` weeks covering only the times of the
    # day equal to the hour/minute of `predict_at`.
    first_start_at = dt.datetime.combine(
        predict_at.date() - dt.timedelta(weeks=train_horizon), time_of_day,
    )
    last_start_at = dt.datetime.combine(
        predict_at.date() - dt.timedelta(days=1), time_of_day,
    )

    # The frequency is the number of weekdays.
    frequency = 7

    # Take only the counts at the `predict_at` time: the label-based slice
    # is inclusive on both ends and the step skips `_n_daily_time_steps`
    # rows at a time.
    # NOTE(review): this presumes `.totals` holds exactly
    # `_n_daily_time_steps` rows per day for every day -- confirm.
    training_df = intra_pixel.loc[
        first_start_at : last_start_at : self._n_daily_time_steps  # type: ignore
    ]
    # One row per day is expected; anything else means the requested
    # window is not fully covered by the data.
    if len(training_df) != frequency * train_horizon:
        raise RuntimeError('Not enough historic data for `predict_at`')

    # NOTE(review): a `predict_at` missing from the index presumably
    # surfaces as a `KeyError` from this lookup -- confirm.
    actual_df = intra_pixel.loc[[predict_at]]

    return training_df, frequency, actual_df