Add aggregate_orders() function

- the function queries the database and aggregates the ad-hoc orders
  by pixel and time step into a demand time series (see the usage sketch below)
- implement "heavy" integration tests for `aggregate_orders()`
- make `pandas` a package dependency
- streamline the `Config`
Alexander Hess 2021-01-07 23:18:40 +01:00
commit d5b3efbca1
10 changed files with 460 additions and 6 deletions
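A minimal sketch of how the new function is meant to be used (obtaining the `Grid` via `db.session` is an assumption for illustration, not part of this commit):

from urban_meal_delivery import db
from urban_meal_delivery.forecasts import timify

# Hypothetical: load any pixel grid from the database ...
grid = db.session.query(db.Grid).first()

# ... and aggregate its ad-hoc orders into hourly demand totals.
demand = timify.aggregate_orders(grid=grid, time_step=60)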


@@ -26,14 +26,38 @@ def random_schema_name() -> str:
class Config:
    """Configuration that applies in all situations."""

    # Application-specific settings
    # -----------------------------

    # Date after which the real-life data is discarded.
    CUTOFF_DAY = datetime.datetime(2017, 2, 1)

    # If a scheduled pre-order is made within this
    # time horizon, we treat it as an ad-hoc order.
    QUASI_AD_HOC_LIMIT = datetime.timedelta(minutes=45)

    # Operating hours of the platform.
    SERVICE_START = 11
    SERVICE_END = 23

    # Side lengths (in meters) for which pixel grids are created.
    # They are the basis for the aggregated demand forecasts.
    GRID_SIDE_LENGTHS = [707, 1000, 1414]

    # Time steps (in minutes) used to aggregate the
    # individual orders into time series.
    TIME_STEPS = [60]

    # Training horizons (in full weeks) used
    # to train the forecasting models.
    TRAINING_HORIZONS = [8]

    # The demand forecasting methods used in the simulations.
    FORECASTING_METHODS = ['hets', 'rtarima']

    # Implementation-specific settings
    # --------------------------------

    DATABASE_URI = os.getenv('DATABASE_URI')

    # The PostgreSQL schema that holds the tables with the original data.

@@ -43,7 +43,7 @@ class Grid(meta.Base):

    def __repr__(self) -> str:
        """Non-literal text representation."""
-        return '<{cls}: {area}>'.format(
+        return '<{cls}: {area} sqr. km>'.format(
            cls=self.__class__.__name__, area=self.pixel_area,
        )
@@ -51,7 +51,7 @@ class Grid(meta.Base):
    @property
    def pixel_area(self) -> float:
        """The area of a `Pixel` on the grid in square kilometers."""
-        return (self.side_length ** 2) / 1_000_000  # noqa:WPS432
+        return round((self.side_length ** 2) / 1_000_000, 1)  # noqa:WPS432

    @classmethod
    def gridify(cls, city: db.City, side_length: int) -> db.Grid:
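The one-decimal rounding yields clean areas for the three configured side lengths; a quick standalone check:

# The configured side lengths (in meters) map onto one-decimal areas.
for side_length in (707, 1000, 1414):
    print(side_length, '->', round(side_length ** 2 / 1_000_000, 1))
# 707 -> 0.5, 1000 -> 1.0, 1414 -> 2.0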


@@ -0,0 +1,3 @@
"""Demand forecasting utilities."""

from urban_meal_delivery.forecasts import timify


@@ -0,0 +1,114 @@
"""Obtain and work with time series data."""

import datetime

import pandas as pd

from urban_meal_delivery import config
from urban_meal_delivery import db

def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame:  # pragma: no cover
    """Obtain a time series of the ad-hoc `Order` totals.

    Args:
        grid: pixel grid used to aggregate orders spatially
        time_step: interval length (in minutes) into which orders are aggregated

    Returns:
        order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and
            beginnings of the intervals (i.e., "start_at"s); the sole column
            with data is "total_orders"
    """
    # `data` is probably missing "pixel_id"-"start_at" pairs.
    # This happens whenever there is no demand in the `Pixel` in the given `time_step`.
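    # How the SQL buckets orders: `placed_at` is truncated to the minute and
    # the remainder of the minutes modulo `time_step` is subtracted. E.g.,
    # with `time_step=60`, an order placed at 11:37:13 is counted towards
    # the interval whose "start_at" is 11:00:00.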
    data = pd.read_sql_query(
        f"""-- # noqa:WPS221
        SELECT
            pixel_id,
            start_at,
            COUNT(*) AS total_orders
        FROM (
            SELECT
                pixel_id,
                placed_at_without_seconds - minutes_to_be_cut AS start_at
            FROM (
                SELECT
                    pixels.pixel_id,
                    DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,
                    ((
                        EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step}
                    )::TEXT || ' MINUTES')::INTERVAL
                        AS minutes_to_be_cut
                FROM (
                    SELECT
                        id,
                        placed_at,
                        pickup_address_id
                    FROM
                        {config.CLEAN_SCHEMA}.orders
                    INNER JOIN (
                        SELECT
                            id AS address_id
                        FROM
                            {config.CLEAN_SCHEMA}.addresses
                        WHERE
                            city_id = {grid.city.id}
                    ) AS in_city
                        ON orders.pickup_address_id = in_city.address_id
                    WHERE
                        ad_hoc IS TRUE
                ) AS orders
                INNER JOIN (
                    SELECT
                        address_id,
                        pixel_id
                    FROM
                        {config.CLEAN_SCHEMA}.addresses_pixels
                    WHERE
                        grid_id = {grid.id}
                        AND
                        city_id = {grid.city.id} -- city_id is redundant -> sanity check
                ) AS pixels
                    ON orders.pickup_address_id = pixels.address_id
            ) AS placed_at_aggregated_into_start_at
        ) AS pixel_start_at_combinations
        GROUP BY
            pixel_id,
            start_at
        ORDER BY
            pixel_id,
            start_at;
        """,
        con=db.connection,
        index_col=['pixel_id', 'start_at'],
    )
    if data.empty:
        return data

    # Calculate the first and last "start_at" value ...
    start_day = data.index.levels[1].min().date()
    start = datetime.datetime(
        start_day.year, start_day.month, start_day.day, config.SERVICE_START,
    )
    end_day = data.index.levels[1].max().date()
    end = datetime.datetime(
        end_day.year, end_day.month, end_day.day, config.SERVICE_END,
    )

    # ... and all possible `tuple`s of "pixel_id"-"start_at" combinations.
    # The "start_at" values must lie within the operating hours.
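    # For example, with `SERVICE_START=11`, `SERVICE_END=23`, and a 60-minute
    # time step, each day contributes the twelve "start_at" values
    # 11:00, 12:00, ..., 22:00 (23:00 itself is filtered out below).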
    gen = (
        (pixel_id, start_at)
        for pixel_id in sorted(data.index.levels[0])
        for start_at in pd.date_range(start, end, freq=f'{time_step}T')
        if config.SERVICE_START <= start_at.time().hour < config.SERVICE_END
    )

    # Re-index `data` filling in `0`s where there is no demand.
    index = pd.MultiIndex.from_tuples(gen)
    index.names = ['pixel_id', 'start_at']
    return data.reindex(index, fill_value=0)
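A toy demonstration of the zero-filling re-index used in the last step (standalone, with made-up values):

import pandas as pd

full_index = pd.MultiIndex.from_tuples(
    [(1, '11:00'), (1, '12:00')], names=['pixel_id', 'start_at'],
)
# Only the 11:00 interval saw any demand ...
data = pd.DataFrame({'total_orders': [3]}, index=full_index[:1])
# ... so re-indexing adds the (1, '12:00') row with a `0`.
print(data.reindex(full_index, fill_value=0))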