From 100fac659ad663df11af7d12ad649fe56ebc6652 Mon Sep 17 00:00:00 2001
From: Alexander Hess
Date: Sat, 9 Jan 2021 17:30:00 +0100
Subject: [PATCH] Add `OrderHistory.make_real_time_time_series()`

- the method slices out a real-time time series from the data
  within an `OrderHistory` object
---
 src/urban_meal_delivery/forecasts/timify.py   |  90 ++++++++++
 tests/config.py                               |   3 +
 .../forecasts/timify/test_make_time_series.py | 167 +++++++++++++++++-
 3 files changed, 257 insertions(+), 3 deletions(-)

diff --git a/src/urban_meal_delivery/forecasts/timify.py b/src/urban_meal_delivery/forecasts/timify.py
index 078c972..6906d24 100644
--- a/src/urban_meal_delivery/forecasts/timify.py
+++ b/src/urban_meal_delivery/forecasts/timify.py
@@ -294,3 +294,93 @@ class OrderHistory:
         ]
 
         return training_df, frequency, actuals_df
+
+    def make_real_time_time_series(  # noqa:WPS210
+        self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
+    ) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
+        """Slice a vertical real-time time series out of the `.totals`.
+
+        Create a time series covering `train_horizon` weeks that can be used
+        for training a forecasting model to predict the demand at `predict_at`.
+
+        For explanation of the terms "horizontal", "vertical", and "real-time"
+        in the context of time series, see section 3.2 in the following paper:
+        https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf
+
+        Args:
+            pixel_id: pixel in which the time series is aggregated
+            predict_at: time step (i.e., "start_at") for which a prediction is made
+            train_horizon: weeks of historic data used to predict `predict_at`
+
+        Returns:
+            training time series, frequency, actual order count at `predict_at`
+
+        Raises:
+            LookupError: `pixel_id` is not in the `grid`
+            RuntimeError: desired time series slice is not entirely in `.totals`
+        """
+        try:
+            intra_pixel = self.totals.loc[pixel_id]
+        except KeyError:
+            raise LookupError('The `pixel_id` is not in the `grid`') from None
+
+        if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
+            raise RuntimeError('Internal error: cannot predict beyond the given data')
+
+        # The first and last training day are just before the `predict_at` day
+        # and span exactly `train_horizon` weeks covering all times of the day,
+        # including times on the `predict_at` day that are earlier than `predict_at`.
+        first_train_day = predict_at.date() - dt.timedelta(weeks=train_horizon)
+        first_start_at = dt.datetime(
+            first_train_day.year,
+            first_train_day.month,
+            first_train_day.day,
+            config.SERVICE_START,
+            0,
+        )
+        # Predicting the first time step on the `predict_at` day is a corner case.
+        # Then, the previous day is indeed the `last_train_day`. Predicting any
+        # other time step implies that the `predict_at` day is the `last_train_day`.
+        # `last_train_time` is the last "start_at" before the one being predicted.
+        if predict_at.time() == dt.time(config.SERVICE_START, 0):
+            last_train_day = predict_at.date() - dt.timedelta(days=1)
+            last_train_time = dt.time(config.SERVICE_END, 0)
+        else:
+            last_train_day = predict_at.date()
+            last_train_time = predict_at.time()
+        last_start_at = dt.datetime(
+            last_train_day.year,
+            last_train_day.month,
+            last_train_day.day,
+            last_train_time.hour,
+            last_train_time.minute,
+        ) - dt.timedelta(minutes=self._time_step)
+
+        # The frequency is the number of weekdays times the number of daily time steps.
+        frequency = 7 * self._n_daily_time_steps
+
+        # Take all the counts between `first_train_day` and `last_train_day`,
+        # including the ones on the `predict_at` day prior to `predict_at`.
+        training_df = intra_pixel.loc[
+            first_start_at:last_start_at  # type: ignore
+        ]
+        n_time_steps_on_predict_day = (
+            (
+                predict_at
+                - dt.datetime(
+                    predict_at.year,
+                    predict_at.month,
+                    predict_at.day,
+                    config.SERVICE_START,
+                    0,
+                )
+            ).seconds
+            // 60  # -> minutes
+            // self._time_step
+        )
+        if len(training_df) != frequency * train_horizon + n_time_steps_on_predict_day:
+            raise RuntimeError('Not enough historic data for `predict_at`')
+
+        actual_df = intra_pixel.loc[[predict_at]]
+
+        return training_df, frequency, actual_df
diff --git a/tests/config.py b/tests/config.py
index c2e3175..5c4c83c 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -8,6 +8,9 @@ from urban_meal_delivery import config
 # The day on which most test cases take place.
 YEAR, MONTH, DAY = 2016, 7, 1
 
+# The hour when most test cases take place.
+NOON = 12
+
 # `START` and `END` constitute a 15-day time span.
 # That implies a maximum `train_horizon` of `2` as that needs full 7-day weeks.
 START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
diff --git a/tests/forecasts/timify/test_make_time_series.py b/tests/forecasts/timify/test_make_time_series.py
index 521cd08..dc4eee9 100644
--- a/tests/forecasts/timify/test_make_time_series.py
+++ b/tests/forecasts/timify/test_make_time_series.py
@@ -1,4 +1,8 @@
-"""Test the time series related code."""
+"""Test the code generating time series with the order totals.
+
+Unless otherwise noted, each `time_step` is 60 minutes long implying
+12 time steps per day (i.e., we use `LONG_TIME_STEP` by default).
+"""
 # pylint:disable=no-self-use,unused-argument
 
 import datetime
@@ -63,7 +67,11 @@ def good_predict_at():
     or `train_horizon=2` works.
     """
     return datetime.datetime(
-        test_config.END.year, test_config.END.month, test_config.END.day, 12, 0,
+        test_config.END.year,
+        test_config.END.month,
+        test_config.END.day,
+        test_config.NOON,
+        0,
     )
 
 
@@ -76,7 +84,7 @@ def bad_predict_at():
     """
     predict_day = test_config.END - datetime.timedelta(weeks=1, days=1)
     return datetime.datetime(
-        predict_day.year, predict_day.month, predict_day.day, 12, 0,
+        predict_day.year, predict_day.month, predict_day.day, test_config.NOON, 0,
     )
 
 
@@ -282,3 +290,156 @@ class TestMakeVerticalTimeSeries:
                 predict_day=good_predict_at.date(),
                 train_horizon=999,
             )
+
+
+class TestMakeRealTimeTimeSeries:
+    """Test the `OrderHistory.make_real_time_time_series()` method."""
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
+        """A `pixel_id` that is not in the `grid`."""
+        with pytest.raises(LookupError):
+            order_history.make_real_time_time_series(
+                pixel_id=999_999,
+                predict_at=good_predict_at,
+                train_horizon=train_horizon,
+            )
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_are_dataframes(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The time series come in a one-column `pd.DataFrame`."""
+        result = order_history.make_real_time_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        assert isinstance(training_df, pd.DataFrame)
+        assert training_df.columns == ['total_orders']
+        assert isinstance(actual_df, pd.DataFrame)
+        assert actual_df.columns == ['total_orders']
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_have_correct_length1(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The length of a training time series is the product of the ...
+
+        ... weekly time steps (i.e., product of `7` and the number of daily time steps)
+        and the `train_horizon` in weeks; however, this assertion only holds if
+        we predict the first `time_step` of the day.
+
+        The time series with the actual order counts always holds `1` value.
+        """
+        predict_at = datetime.datetime(
+            good_predict_at.year,
+            good_predict_at.month,
+            good_predict_at.day,
+            config.SERVICE_START,
+            0,
+        )
+        result = order_history.make_real_time_time_series(
+            pixel_id=good_pixel_id, predict_at=predict_at, train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        n_daily_time_steps = (
+            60
+            * (config.SERVICE_END - config.SERVICE_START)
+            // test_config.LONG_TIME_STEP
+        )
+
+        assert len(training_df) == 7 * n_daily_time_steps * train_horizon
+        assert len(actual_df) == 1
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_have_correct_length2(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The length of a training time series is the product of the ...
+
+        ... weekly time steps (i.e., product of `7` and the number of daily time steps)
+        and the `train_horizon` in weeks; however, this assertion only holds if
+        we predict the first `time_step` of the day. Predicting any other `time_step`
+        means that the training time series becomes longer by the number of time steps
+        before the one being predicted.
+
+        The time series with the actual order counts always holds `1` value.
+        """
+        assert good_predict_at.hour == test_config.NOON
+
+        result = order_history.make_real_time_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        n_daily_time_steps = (
+            60
+            * (config.SERVICE_END - config.SERVICE_START)
+            // test_config.LONG_TIME_STEP
+        )
+        n_time_steps_before = (
+            60 * (test_config.NOON - config.SERVICE_START) // test_config.LONG_TIME_STEP
+        )
+
+        assert (
+            len(training_df)
+            == 7 * n_daily_time_steps * train_horizon + n_time_steps_before
+        )
+        assert len(actual_df) == 1
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_frequency_is_number_of_weekly_time_steps(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The `frequency` is the number of weekly time steps."""
+        result = order_history.make_real_time_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        _, frequency, _ = result  # noqa:WPS434
+
+        n_daily_time_steps = (
+            60
+            * (config.SERVICE_END - config.SERVICE_START)
+            // test_config.LONG_TIME_STEP
+        )
+
+        assert frequency == 7 * n_daily_time_steps
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_no_long_enough_history1(
+        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
+    ):
+        """If the `predict_at` day is too early in the `START`-`END` horizon ...
+
+        ... the history of order totals is not long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_real_time_time_series(
+                pixel_id=good_pixel_id,
+                predict_at=bad_predict_at,
+                train_horizon=train_horizon,
+            )
+
+    def test_no_long_enough_history2(
+        self, order_history, good_pixel_id, good_predict_at,
+    ):
+        """If the `train_horizon` is longer than the `START`-`END` horizon ...
+
+        ... the history of order totals can never be long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_real_time_time_series(
+                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
+            )