Add aggregate_orders() function

- the function queries the database and aggregates the ad-hoc orders
  by pixel and time step into a demand time series (see the usage sketch below)
- implement "heavy" integration tests for `aggregate_orders()`
- make `pandas` a package dependency
- streamline the `Config`
Alexander Hess 2021-01-07 23:18:40 +01:00
commit d5b3efbca1
10 changed files with 460 additions and 6 deletions
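A minimal sketch of how the new function is meant to be used (obtaining the `Grid` via `db.session` is an assumption for illustration, not part of this commit):

from urban_meal_delivery import db
from urban_meal_delivery.forecasts import timify

# Hypothetical: load any pixel grid from the database ...
grid = db.session.query(db.Grid).first()

# ... and aggregate its ad-hoc orders into hourly demand totals.
demand = timify.aggregate_orders(grid=grid, time_step=60)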


@@ -26,14 +26,38 @@ def random_schema_name() -> str:
class Config:
    """Configuration that applies in all situations."""

    # Application-specific settings
    # -----------------------------

    # Date after which the real-life data is discarded.
    CUTOFF_DAY = datetime.datetime(2017, 2, 1)

    # If a scheduled pre-order is made within this
    # time horizon, we treat it as an ad-hoc order.
    QUASI_AD_HOC_LIMIT = datetime.timedelta(minutes=45)

    # Operating hours of the platform.
    SERVICE_START = 11
    SERVICE_END = 23

    # Side lengths (in meters) for which pixel grids are created.
    # They are the basis for the aggregated demand forecasts.
    GRID_SIDE_LENGTHS = [707, 1000, 1414]

    # Time steps (in minutes) used to aggregate the
    # individual orders into time series.
    TIME_STEPS = [60]

    # Training horizons (in full weeks) used
    # to train the forecasting models.
    TRAINING_HORIZONS = [8]

    # The demand forecasting methods used in the simulations.
    FORECASTING_METHODS = ['hets', 'rtarima']

    # Implementation-specific settings
    # --------------------------------

    DATABASE_URI = os.getenv('DATABASE_URI')

    # The PostgreSQL schema that holds the tables with the original data.

@@ -43,7 +43,7 @@ class Grid(meta.Base):

    def __repr__(self) -> str:
        """Non-literal text representation."""
-        return '<{cls}: {area}>'.format(
+        return '<{cls}: {area} sqr. km>'.format(
            cls=self.__class__.__name__, area=self.pixel_area,
        )
@@ -51,7 +51,7 @@ class Grid(meta.Base):
    @property
    def pixel_area(self) -> float:
        """The area of a `Pixel` on the grid in square kilometers."""
-        return (self.side_length ** 2) / 1_000_000  # noqa:WPS432
+        return round((self.side_length ** 2) / 1_000_000, 1)  # noqa:WPS432

    @classmethod
    def gridify(cls, city: db.City, side_length: int) -> db.Grid:
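The one-decimal rounding yields clean areas for the three configured side lengths; a quick standalone check:

# The configured side lengths (in meters) map onto one-decimal areas.
for side_length in (707, 1000, 1414):
    print(side_length, '->', round(side_length ** 2 / 1_000_000, 1))
# 707 -> 0.5, 1000 -> 1.0, 1414 -> 2.0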


@@ -0,0 +1,3 @@
"""Demand forecasting utilities."""

from urban_meal_delivery.forecasts import timify


@@ -0,0 +1,114 @@
"""Obtain and work with time series data."""

import datetime

import pandas as pd

from urban_meal_delivery import config
from urban_meal_delivery import db

def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame:  # pragma: no cover
    """Obtain a time series of the ad-hoc `Order` totals.

    Args:
        grid: pixel grid used to aggregate orders spatially
        time_step: interval length (in minutes) into which orders are aggregated

    Returns:
        order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and
            beginnings of the intervals (i.e., "start_at"s); the sole column
            with data is "total_orders"
    """
    # `data` is probably missing "pixel_id"-"start_at" pairs.
    # This happens whenever there is no demand in the `Pixel` in the given `time_step`.
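    # How the SQL buckets orders: `placed_at` is truncated to the minute and
    # the remainder of the minutes modulo `time_step` is subtracted. E.g.,
    # with `time_step=60`, an order placed at 11:37:13 is counted towards
    # the interval whose "start_at" is 11:00:00.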
    data = pd.read_sql_query(
        f"""-- # noqa:WPS221
        SELECT
            pixel_id,
            start_at,
            COUNT(*) AS total_orders
        FROM (
            SELECT
                pixel_id,
                placed_at_without_seconds - minutes_to_be_cut AS start_at
            FROM (
                SELECT
                    pixels.pixel_id,
                    DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,
                    ((
                        EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step}
                    )::TEXT || ' MINUTES')::INTERVAL
                        AS minutes_to_be_cut
                FROM (
                    SELECT
                        id,
                        placed_at,
                        pickup_address_id
                    FROM
                        {config.CLEAN_SCHEMA}.orders
                    INNER JOIN (
                        SELECT
                            id AS address_id
                        FROM
                            {config.CLEAN_SCHEMA}.addresses
                        WHERE
                            city_id = {grid.city.id}
                    ) AS in_city
                        ON orders.pickup_address_id = in_city.address_id
                    WHERE
                        ad_hoc IS TRUE
                ) AS orders
                INNER JOIN (
                    SELECT
                        address_id,
                        pixel_id
                    FROM
                        {config.CLEAN_SCHEMA}.addresses_pixels
                    WHERE
                        grid_id = {grid.id}
                        AND
                        city_id = {grid.city.id} -- city_id is redundant -> sanity check
                ) AS pixels
                    ON orders.pickup_address_id = pixels.address_id
            ) AS placed_at_aggregated_into_start_at
        ) AS pixel_start_at_combinations
        GROUP BY
            pixel_id,
            start_at
        ORDER BY
            pixel_id,
            start_at;
        """,
        con=db.connection,
        index_col=['pixel_id', 'start_at'],
    )
    if data.empty:
        return data

    # Calculate the first and last "start_at" value ...
    start_day = data.index.levels[1].min().date()
    start = datetime.datetime(
        start_day.year, start_day.month, start_day.day, config.SERVICE_START,
    )
    end_day = data.index.levels[1].max().date()
    end = datetime.datetime(
        end_day.year, end_day.month, end_day.day, config.SERVICE_END,
    )

    # ... and all possible `tuple`s of "pixel_id"-"start_at" combinations.
    # The "start_at" values must lie within the operating hours.
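    # For example, with `SERVICE_START=11`, `SERVICE_END=23`, and a 60-minute
    # time step, each day contributes the twelve "start_at" values
    # 11:00, 12:00, ..., 22:00 (23:00 itself is filtered out below).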
    gen = (
        (pixel_id, start_at)
        for pixel_id in sorted(data.index.levels[0])
        for start_at in pd.date_range(start, end, freq=f'{time_step}T')
        if config.SERVICE_START <= start_at.time().hour < config.SERVICE_END
    )

    # Re-index `data` filling in `0`s where there is no demand.
    index = pd.MultiIndex.from_tuples(gen)
    index.names = ['pixel_id', 'start_at']
    return data.reindex(index, fill_value=0)
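A toy demonstration of the zero-filling re-index used in the last step (standalone, with made-up values):

import pandas as pd

full_index = pd.MultiIndex.from_tuples(
    [(1, '11:00'), (1, '12:00')], names=['pixel_id', 'start_at'],
)
# Only the 11:00 interval saw any demand ...
data = pd.DataFrame({'total_orders': [3]}, index=full_index[:1])
# ... so re-indexing adds the (1, '12:00') row with a `0`.
print(data.reindex(full_index, fill_value=0))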