From b61db734b666521b32c0d3d5273b3752e9eb6bad Mon Sep 17 00:00:00 2001
From: Alexander Hess <alexander@webartifex.biz>
Date: Sat, 9 Jan 2021 16:34:42 +0100
Subject: [PATCH] Add `OrderHistory.make_horizontal_time_series()`

- the method slices out a horizontal time series from the data within
  an `OrderHistory` object
---
 src/urban_meal_delivery/forecasts/timify.py   |  67 +++++++
 tests/config.py                               |  18 ++
 .../forecasts/timify/test_make_time_series.py | 173 ++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 tests/forecasts/timify/test_make_time_series.py

diff --git a/src/urban_meal_delivery/forecasts/timify.py b/src/urban_meal_delivery/forecasts/timify.py
index 87d26b5..d9e43fc 100644
--- a/src/urban_meal_delivery/forecasts/timify.py
+++ b/src/urban_meal_delivery/forecasts/timify.py
@@ -1,6 +1,7 @@
 """Obtain and work with time series data."""
 
 import datetime as dt
+from typing import Tuple
 
 import pandas as pd
 
@@ -145,3 +146,69 @@ class OrderHistory:
         index.names = ['pixel_id', 'start_at']
 
         return data.reindex(index, fill_value=0)
+
+    def make_horizontal_time_series(  # noqa:WPS210
+        self, pixel_id: int, predict_at: dt.datetime, train_horizon: int,
+    ) -> Tuple[pd.DataFrame, int, pd.DataFrame]:
+        """Slice a horizontal time series out of the `.totals`.
+
+        Create a time series covering `train_horizon` weeks that can be used
+        for training a forecasting model to predict the demand at `predict_at`.
+
+        For explanation of the terms "horizontal", "vertical", and "real-time"
+        in the context of time series, see section 3.2 in the following paper:
+        https://github.com/webartifex/urban-meal-delivery-demand-forecasting/blob/main/paper.pdf
+
+        Args:
+            pixel_id: pixel in which the time series is aggregated
+            predict_at: time step (i.e., "start_at") for which a prediction is made
+            train_horizon: weeks of historic data used to predict `predict_at`
+
+        Returns:
+            training time series, frequency, one-row frame of actuals at `predict_at`
+
+        Raises:
+            LookupError: `pixel_id` is not in the `grid`
+            RuntimeError: desired time series slice is not entirely in `.totals`
+        """
+        try:
+            intra_pixel = self.totals.loc[pixel_id]
+        except KeyError:
+            raise LookupError('The `pixel_id` is not in the `grid`') from None
+
+        if predict_at >= config.CUTOFF_DAY:  # pragma: no cover
+            raise RuntimeError('Internal error: cannot predict beyond the given data')
+
+        # The training days end on the day before the `predict_at` day and
+        # span exactly `train_horizon` weeks; on each day, only the time step
+        # equal to the hour/minute of `predict_at` is used.
+        first_train_day = predict_at.date() - dt.timedelta(weeks=train_horizon)
+        first_start_at = dt.datetime(
+            first_train_day.year,
+            first_train_day.month,
+            first_train_day.day,
+            predict_at.hour,
+            predict_at.minute,
+        )
+        last_train_day = predict_at.date() - dt.timedelta(days=1)
+        last_start_at = dt.datetime(
+            last_train_day.year,
+            last_train_day.month,
+            last_train_day.day,
+            predict_at.hour,
+            predict_at.minute,
+        )
+
+        # The frequency is the number of weekdays.
+        frequency = 7
+
+        # Take only the counts at the `predict_at` time.
+        training_df = intra_pixel.loc[
+            first_start_at : last_start_at : self._n_daily_time_steps  # type: ignore
+        ]
+        if len(training_df) != frequency * train_horizon:
+            raise RuntimeError('Not enough historic data for `predict_at`')
+
+        actual_df = intra_pixel.loc[[predict_at]]
+
+        return training_df, frequency, actual_df
diff --git a/tests/config.py b/tests/config.py
index 288c471..c2e3175 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -1,10 +1,28 @@
 """Globals used when testing."""
 
+import datetime
+
+from urban_meal_delivery import config
+
 
 # The day on which most test cases take place.
 YEAR, MONTH, DAY = 2016, 7, 1
 
+# `START` and `END` constitute a 15-day time span.
+# That implies a maximum `train_horizon` of `2` as that needs full 7-day weeks.
+START = datetime.datetime(YEAR, MONTH, DAY, config.SERVICE_START, 0)
+_end_day = (START + datetime.timedelta(weeks=2)).date()
+END = datetime.datetime(
+    _end_day.year, _end_day.month, _end_day.day, config.SERVICE_END, 0,
+)
+
 # Default time steps, for example, for `OrderHistory` objects.
 LONG_TIME_STEP = 60
 SHORT_TIME_STEP = 30
 TIME_STEPS = (SHORT_TIME_STEP, LONG_TIME_STEP)
+
+# Default training horizons, for example, for
+# `OrderHistory.make_horizontal_time_series()`.
+LONG_TRAIN_HORIZON = 2
+SHORT_TRAIN_HORIZON = 1
+TRAIN_HORIZONS = (SHORT_TRAIN_HORIZON, LONG_TRAIN_HORIZON)
diff --git a/tests/forecasts/timify/test_make_time_series.py b/tests/forecasts/timify/test_make_time_series.py
new file mode 100644
index 0000000..bb7f682
--- /dev/null
+++ b/tests/forecasts/timify/test_make_time_series.py
@@ -0,0 +1,173 @@
+"""Test the time series related code."""
+# pylint:disable=no-self-use,unused-argument
+
+import datetime
+
+import pandas as pd
+import pytest
+
+from tests import config as test_config
+from urban_meal_delivery import config
+from urban_meal_delivery.forecasts import timify
+
+
+@pytest.fixture
+def good_pixel_id():
+    """A `pixel_id` that is on the `grid`."""
+    return 1
+
+
+@pytest.fixture
+def order_totals(good_pixel_id):
+    """A mock for `OrderHistory.totals`.
+
+    To be a bit more realistic, we sample two pixels on the `grid`.
+    """
+    pixel_ids = [good_pixel_id, good_pixel_id + 1]
+
+    gen = (
+        (pixel_id, start_at)
+        for pixel_id in pixel_ids
+        for start_at in pd.date_range(
+            test_config.START, test_config.END, freq=f'{test_config.LONG_TIME_STEP}T',
+        )
+        if config.SERVICE_START <= start_at.hour < config.SERVICE_END
+    )
+
+    # Build the (`pixel_id`, `start_at`) index; all totals are set to `0` below.
+    index = pd.MultiIndex.from_tuples(gen)
+    index.names = ['pixel_id', 'start_at']
+
+    df = pd.DataFrame(data={'total_orders': 0}, index=index)
+
+    # Sanity check: n_pixels * n_time_steps_per_day * (n_weekdays * n_weeks + 1).
+    assert len(df) == 2 * 12 * (7 * 2 + 1)
+
+    return df
+
+
+@pytest.fixture
+def order_history(order_totals, grid):
+    """An `OrderHistory` object that does not need the database."""
+    oh = timify.OrderHistory(grid=grid, time_step=test_config.LONG_TIME_STEP)
+    oh._data = order_totals  # pylint:disable=protected-access
+
+    return oh
+
+
+@pytest.fixture
+def good_predict_at():
+    """A `predict_at` within `START`-`END` and ...
+
+    ... a long enough history so that either `train_horizon=1`
+    or `train_horizon=2` works.
+    """
+    return datetime.datetime(
+        test_config.END.year, test_config.END.month, test_config.END.day, 12, 0,
+    )
+
+
+@pytest.fixture
+def bad_predict_at():
+    """A `predict_at` within `START`-`END` but ...
+
+    ... without a long enough history so that neither `train_horizon=1`
+    nor `train_horizon=2` works.
+    """
+    predict_day = test_config.END - datetime.timedelta(weeks=1, days=1)
+    return datetime.datetime(
+        predict_day.year, predict_day.month, predict_day.day, 12, 0,
+    )
+
+
+class TestMakeHorizontalTimeSeries:
+    """Test the `OrderHistory.make_horizontal_time_series()` method."""
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_wrong_pixel(self, order_history, good_predict_at, train_horizon):
+        """A `pixel_id` that is not in the `grid`."""
+        with pytest.raises(LookupError):
+            order_history.make_horizontal_time_series(
+                pixel_id=999_999,
+                predict_at=good_predict_at,
+                train_horizon=train_horizon,
+            )
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_are_dataframes(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The time series come in a one-column `pd.DataFrame`."""
+        result = order_history.make_horizontal_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        assert isinstance(training_df, pd.DataFrame)
+        assert training_df.columns == ['total_orders']
+        assert isinstance(actual_df, pd.DataFrame)
+        assert actual_df.columns == ['total_orders']
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_time_series_have_correct_length(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The length of a training time series must be a multiple of `7` ...
+
+        whereas the time series with the actual order counts always holds `1` value.
+        """
+        result = order_history.make_horizontal_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        training_df, _, actual_df = result
+
+        assert len(training_df) == 7 * train_horizon
+        assert len(actual_df) == 1
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_frequency_is_number_of_weekdays(
+        self, order_history, good_pixel_id, good_predict_at, train_horizon,
+    ):
+        """The `frequency` must be `7`."""
+        result = order_history.make_horizontal_time_series(
+            pixel_id=good_pixel_id,
+            predict_at=good_predict_at,
+            train_horizon=train_horizon,
+        )
+
+        _, frequency, _ = result  # noqa:WPS434
+
+        assert frequency == 7
+
+    @pytest.mark.parametrize('train_horizon', test_config.TRAIN_HORIZONS)
+    def test_no_long_enough_history1(
+        self, order_history, good_pixel_id, bad_predict_at, train_horizon,
+    ):
+        """If the `predict_at` day is too early in the `START`-`END` horizon ...
+
+        ... the history of order totals is not long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_horizontal_time_series(
+                pixel_id=good_pixel_id,
+                predict_at=bad_predict_at,
+                train_horizon=train_horizon,
+            )
+
+    def test_no_long_enough_history2(
+        self, order_history, good_pixel_id, good_predict_at,
+    ):
+        """If the `train_horizon` is longer than the `START`-`END` horizon ...
+
+        ... the history of order totals can never be long enough.
+        """
+        with pytest.raises(RuntimeError):
+            order_history.make_horizontal_time_series(
+                pixel_id=good_pixel_id, predict_at=good_predict_at, train_horizon=999,
+            )