Add OrderHistory.make_vertical_time_series()

- the method slices out a vertical time series from the data within an `OrderHistory` object
2021-01-09 17:00:10 +01:00 · 2021-01-09 17:00:10 +01:00 · 5330ceb771
commit 5330ceb771
parent b61db734b6
2 changed files with 194 additions and 1 deletions
--- a/src/urban_meal_delivery/forecasts/timify.py
+++ b/src/urban_meal_delivery/forecasts/timify.py
@ -212,3 +212,85 @@ class OrderHistory:
        actual_df = intra_pixel.loc[[predict_at]]

        return training_df, frequency, actual_df
+
+    def make_vertical_time_series(  # noqa:WPS210
+        self, pixel_id: int, predict_day: dt.date, train_horizon: int,
+    ) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
+        """Slice a vertical time series out of the `.totals`.
+
+        Create a time series covering `train_horizon` weeks that can be used
+        for training a forecasting model to predict the demand on the `predict_day`.
+
+        For explanation of the terms "horizontal", "vertical", and "real-time"
+        in the context of time series, see section 3.2 in the following paper:
+        https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf
+
+        Args:
+            pixel_id: pixel in which the time series is aggregated
+            predict_day: day for which predictions are made
+            train_horizon: weeks of historic data used to predict `predict_day`
+
+        Returns:
+            training time series, frequency, actual order counts on `predict_day`
+
+        Raises:
+            LookupError: `pixel_id` is not in the `grid`
+            RuntimeError: desired time series slice is not entirely in `.totals`
+        """
+        try:
+            intra_pixel = self.totals.loc[pixel_id]
+        except KeyError:
+            raise LookupError('The `pixel_id` is not in the `grid`') from None
+
+        if predict_day >= config.CUTOFF_DAY.date():  # pragma: no cover
+            raise RuntimeError('Internal error: cannot predict beyond the given data')
+
+        # The first and last training day are just before the `predict_day`
+        # and span exactly `train_horizon` weeks covering all times of the day.
+        first_train_day = predict_day - dt.timedelta(weeks=train_horizon)
+        first_start_at = dt.datetime(
+            first_train_day.year,
+            first_train_day.month,
+            first_train_day.day,
+            config.SERVICE_START,
+            0,
+        )
+        last_train_day = predict_day - dt.timedelta(days=1)
+        last_start_at = dt.datetime(
+            last_train_day.year,
+            last_train_day.month,
+            last_train_day.day,
+            config.SERVICE_END,  # subtract one `time_step` below
+            0,
+        ) - dt.timedelta(minutes=self._time_step)
+
+        # The frequency is the number of weekdays times the number of daily time steps.
+        frequency = 7 * self._n_daily_time_steps
+
+        # Take all the counts between `first_train_day` and `last_train_day`.
+        training_df = intra_pixel.loc[
+            first_start_at:last_start_at  # type: ignore
+        ]
+        if len(training_df) != frequency * train_horizon:
+            raise RuntimeError('Not enough historic data for `predict_day`')
+
+        first_prediction_at = dt.datetime(
+            predict_day.year,
+            predict_day.month,
+            predict_day.day,
+            config.SERVICE_START,
+            0,
+        )
+        last_prediction_at = dt.datetime(
+            predict_day.year,
+            predict_day.month,
+            predict_day.day,
+            config.SERVICE_END,  # subtract one `time_step` below
+            0,
+        ) - dt.timedelta(minutes=self._time_step)
+
+        actuals_df = intra_pixel.loc[
+            first_prediction_at:last_prediction_at  # type: ignore
+        ]
+
+        return training_df, frequency, actuals_df
--- a/tests/forecasts/timify/test_make_time_series.py
+++ b/tests/forecasts/timify/test_make_time_series.py
@ -117,7 +117,7 @@ class TestMakeHorizontalTimeSeries:
    ):
        """The length of a training time series must be a multiple of `7` ...

-        whereas the time series with the actual order counts always holds `1` value.
+        ... whereas the time series with the actual order counts has only `1` value.
        """
        result = order_history.make_horizontal_time_series(
            pixel_id=good_pixel_id,
@ -171,3 +171,114 @@ class TestMakeHorizontalTimeSeries:
            order_history.make_horizontal_time_series(
                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
            )
+
+
+class TestMakeVerticalTimeSeries:
+    """Test the `OrderHistory.make_vertical_time_series()` method."""
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
+        """A `pixel_id` that is not in the `grid`."""
+        with pytest.raises(LookupError):
+            order_history.make_vertical_time_series(
+                pixel_id=999_999,
+                predict_day=good_predict_at.date(),
+                train_horizon=train_horizon,
+            )
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_are_dataframes(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The time series come in a one-column `pd.DataFrame`."""
+        result = order_history.make_vertical_time_series(
+            pixel_id=good_pixel_id,
+            predict_day=good_predict_at.date(),
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        assert isinstance(training_df, pd.DataFrame)
+        assert training_df.columns == ['total_orders']
+        assert isinstance(actual_df, pd.DataFrame)
+        assert actual_df.columns == ['total_orders']
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_have_correct_length(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The length of a training time series is the product of the ...
+
+        ... weekly time steps (i.e., product of `7` and the number of daily time steps)
+        and the `train_horizon` in weeks.
+
+        The time series with the actual order counts always holds one observation
+        per time step of a day.
+        """
+        result = order_history.make_vertical_time_series(
+            pixel_id=good_pixel_id,
+            predict_day=good_predict_at.date(),
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        n_daily_time_steps = (
+            60
+            * (config.SERVICE_END - config.SERVICE_START)
+            // test_config.LONG_TIME_STEP
+        )
+
+        assert len(training_df) == 7 * n_daily_time_steps * train_horizon
+        assert len(actual_df) == n_daily_time_steps
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_frequency_is_number_number_of_weekly_time_steps(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The `frequency` is the number of weekly time steps."""
+        result = order_history.make_vertical_time_series(
+            pixel_id=good_pixel_id,
+            predict_day=good_predict_at.date(),
+            train_horizon=train_horizon,
+        )
+
+        _, frequency, _ = result  # noqa:WPS434
+
+        n_daily_time_steps = (
+            60
+            * (config.SERVICE_END - config.SERVICE_START)
+            // test_config.LONG_TIME_STEP
+        )
+
+        assert frequency == 7 * n_daily_time_steps
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_no_long_enough_history1(
+        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
+    ):
+        """If the `predict_day` is too early in the `START`-`END` horizon ...
+
+        ... the history of order totals is not long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_vertical_time_series(
+                pixel_id=good_pixel_id,
+                predict_day=bad_predict_at.date(),
+                train_horizon=train_horizon,
+            )
+
+    def test_no_long_enough_history2(
+        self, order_history, good_pixel_id, good_predict_at,
+    ):
+        """If the `train_horizon` is longer than the `START`-`END` horizon ...
+
+        ... the history of order totals can never be long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_vertical_time_series(
+                pixel_id=good_pixel_id,
+                predict_day=good_predict_at.date(),
+                train_horizon=999,
+            )