Add OrderHistory class
- the main purpose of this class is to manage querying the order totals
  from the database and to slice various kinds of time series out of the data
- the class holds the former `aggregate_orders()` function as a method
- modularize the corresponding tests
- add `tests.config` with globals used when testing to provide a single
  source of truth for various settings
parent d5b3efbca1
commit 65d1632e98

6 changed files with 289 additions and 129 deletions
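For orientation, a minimal sketch of how the new class is meant to be used (the import is omitted since the diff does not show the file's path, and `grid` stands in for an existing `db.Grid` instance; `time_step=60` is a made-up value meaning hourly intervals):

    # Hypothetical usage: `grid` is some already-loaded `db.Grid` instance.
    history = OrderHistory(grid=grid, time_step=60)

    # The first access to `.totals` queries the database via `.aggregate_orders()`;
    # later accesses return the cached `DataFrame`.
    totals = history.totals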
@@ -1,6 +1,6 @@
 """Obtain and work with time series data."""
 
-import datetime
+import datetime as dt
 
 import pandas as pd
 
@@ -8,107 +8,140 @@ from urban_meal_delivery import config
 from urban_meal_delivery import db
 
 
-def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame:  # pragma: no cover
-    """Obtain a time series of the ad-hoc `Order` totals.
-
-    Args:
-        grid: pixel grid used to aggregate orders spatially
-        time_step: interval length (in minutes) into which orders are aggregated
-
-    Returns:
-        order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and
-            beginnings of the intervals (i.e., "start_at"s); the sole column
-            with data is "total_orders"
-    """
-    # `data` is probably missing "pixel_id"-"start_at" pairs.
-    # This happens whenever there is no demand in the `Pixel` in the given `time_step`.
-    data = pd.read_sql_query(
-        f"""-- # noqa:WPS221
-        SELECT
-            pixel_id,
-            start_at,
-            COUNT(*) AS total_orders
-        FROM (
-            SELECT
-                pixel_id,
-                placed_at_without_seconds - minutes_to_be_cut AS start_at
-            FROM (
-                SELECT
-                    pixels.pixel_id,
-                    DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,
-                    ((
-                        EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step}
-                    )::TEXT || ' MINUTES')::INTERVAL
-                        AS minutes_to_be_cut
-                FROM (
-                    SELECT
-                        id,
-                        placed_at,
-                        pickup_address_id
-                    FROM
-                        {config.CLEAN_SCHEMA}.orders
-                    INNER JOIN (
-                        SELECT
-                            id AS address_id
-                        FROM
-                            {config.CLEAN_SCHEMA}.addresses
-                        WHERE
-                            city_id = {grid.city.id}
-                    ) AS in_city
-                        ON orders.pickup_address_id = in_city.address_id
-                    WHERE
-                        ad_hoc IS TRUE
-                ) AS
-                    orders
-                INNER JOIN (
-                    SELECT
-                        address_id,
-                        pixel_id
-                    FROM
-                        {config.CLEAN_SCHEMA}.addresses_pixels
-                    WHERE
-                        grid_id = {grid.id}
-                    AND
-                        city_id = {grid.city.id} -- city_id is redundant -> sanity check
-                ) AS pixels
-                    ON orders.pickup_address_id = pixels.address_id
-            ) AS placed_at_aggregated_into_start_at
-        ) AS pixel_start_at_combinations
-        GROUP BY
-            pixel_id,
-            start_at
-        ORDER BY
-            pixel_id,
-            start_at;
-        """,
-        con=db.connection,
-        index_col=['pixel_id', 'start_at'],
-    )
-
-    if data.empty:
-        return data
-
-    # Calculate the first and last "start_at" value ...
-    start_day = data.index.levels[1].min().date()
-    start = datetime.datetime(
-        start_day.year, start_day.month, start_day.day, config.SERVICE_START,
-    )
-    end_day = data.index.levels[1].max().date()
-    end = datetime.datetime(
-        end_day.year, end_day.month, end_day.day, config.SERVICE_END,
-    )
-
-    # ... and all possible `tuple`s of "pixel_id"-"start_at" combinations.
-    # The "start_at" values must lie within the operating hours.
-    gen = (
-        (pixel_id, start_at)
-        for pixel_id in sorted(data.index.levels[0])
-        for start_at in pd.date_range(start, end, freq=f'{time_step}T')
-        if config.SERVICE_START <= start_at.time().hour < config.SERVICE_END
-    )
-
-    # Re-index `data` filling in `0`s where there is no demand.
-    index = pd.MultiIndex.from_tuples(gen)
-    index.names = ['pixel_id', 'start_at']
-
-    return data.reindex(index, fill_value=0)
+class OrderHistory:
+    """Generate time series from the `Order` model in the database.
+
+    The purpose of this class is to abstract away the managing of the order data
+    in memory and the slicing of the data into various kinds of time series.
+    """
+
+    def __init__(self, grid: db.Grid, time_step: int) -> None:
+        """Initialize a new `OrderHistory` object.
+
+        Args:
+            grid: pixel grid used to aggregate orders spatially
+            time_step: interval length (in minutes) into which orders are aggregated
+
+        # noqa:DAR401 RuntimeError
+        """
+        self._grid = grid
+        self._time_step = time_step
+
+        # Number of daily time steps must be a whole multiple of `time_step` length.
+        n_daily_time_steps = (
+            60 * (config.SERVICE_END - config.SERVICE_START) / time_step
+        )
+        if n_daily_time_steps != int(n_daily_time_steps):  # pragma: no cover
+            raise RuntimeError('Internal error: configuration has invalid TIME_STEPS')
+        self._n_daily_time_steps = int(n_daily_time_steps)
+
+        # The `_data` are populated by `.aggregate_orders()`.
+        self._data = None
+
+    @property
+    def totals(self) -> pd.DataFrame:
+        """The order totals by `Pixel` and `.time_step`.
+
+        The returned object should not be mutated!
+
+        Returns:
+            order_totals: a one-column `DataFrame` with a `MultiIndex` of the
+                "pixel_id"s and "start_at"s (i.e., beginnings of the intervals);
+                the column with data is "total_orders"
+        """
+        if self._data is None:
+            self._data = self.aggregate_orders()
+
+        return self._data
+
+    def aggregate_orders(self) -> pd.DataFrame:  # pragma: no cover
+        """Generate and load all order totals from the database."""
+        # `data` is probably missing "pixel_id"-"start_at" pairs.
+        # This happens when there is no demand in the `Pixel` in the given `time_step`.
+        data = pd.read_sql_query(
+            f"""-- # noqa:E501,WPS221
+            SELECT
+                pixel_id,
+                start_at,
+                COUNT(*) AS total_orders
+            FROM (
+                SELECT
+                    pixel_id,
+                    placed_at_without_seconds - minutes_to_be_cut AS start_at
+                FROM (
+                    SELECT
+                        pixels.pixel_id,
+                        DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,
+                        ((
+                            EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {self._time_step}
+                        )::TEXT || ' MINUTES')::INTERVAL
+                            AS minutes_to_be_cut
+                    FROM (
+                        SELECT
+                            id,
+                            placed_at,
+                            pickup_address_id
+                        FROM
+                            {config.CLEAN_SCHEMA}.orders
+                        INNER JOIN (
+                            SELECT
+                                id AS address_id
+                            FROM
+                                {config.CLEAN_SCHEMA}.addresses
+                            WHERE
+                                city_id = {self._grid.city.id}
+                        ) AS in_city
+                            ON orders.pickup_address_id = in_city.address_id
+                        WHERE
+                            ad_hoc IS TRUE
+                    ) AS
+                        orders
+                    INNER JOIN (
+                        SELECT
+                            address_id,
+                            pixel_id
+                        FROM
+                            {config.CLEAN_SCHEMA}.addresses_pixels
+                        WHERE
+                            grid_id = {self._grid.id}
+                        AND
+                            city_id = {self._grid.city.id} -- redundant -> sanity check
+                    ) AS pixels
+                        ON orders.pickup_address_id = pixels.address_id
+                ) AS placed_at_aggregated_into_start_at
+            ) AS pixel_start_at_combinations
+            GROUP BY
+                pixel_id,
+                start_at
+            ORDER BY
+                pixel_id,
+                start_at;
+            """,
+            con=db.connection,
+            index_col=['pixel_id', 'start_at'],
+        )
+
+        if data.empty:
+            return data
+
+        # Calculate the first and last "start_at" value ...
+        start_day = data.index.levels[1].min().date()
+        start = dt.datetime(
+            start_day.year, start_day.month, start_day.day, config.SERVICE_START,
+        )
+        end_day = data.index.levels[1].max().date()
+        end = dt.datetime(end_day.year, end_day.month, end_day.day, config.SERVICE_END)
+
+        # ... and all possible `tuple`s of "pixel_id"-"start_at" combinations.
+        # The "start_at" values must lie within the operating hours.
+        gen = (
+            (pixel_id, start_at)
+            for pixel_id in sorted(data.index.levels[0])
+            for start_at in pd.date_range(start, end, freq=f'{self._time_step}T')
+            if config.SERVICE_START <= start_at.hour < config.SERVICE_END
+        )
+
+        # Re-index `data` filling in `0`s where there is no demand.
+        index = pd.MultiIndex.from_tuples(gen)
+        index.names = ['pixel_id', 'start_at']
+
+        return data.reindex(index, fill_value=0)
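A few notes on the change. First, the new whole-multiple check in `__init__()` is plain arithmetic over the configured service hours. A minimal sketch with assumed values, since the actual `config.SERVICE_START` and `config.SERVICE_END` do not appear in this diff:

    SERVICE_START, SERVICE_END = 11, 23  # assumed hours, not taken from this commit

    def n_daily_time_steps(time_step: int) -> int:
        # Mirrors the check in `OrderHistory.__init__()`: the operating day
        # must split into a whole number of `time_step`-minute intervals.
        n = 60 * (SERVICE_END - SERVICE_START) / time_step
        if n != int(n):
            raise RuntimeError('Internal error: configuration has invalid TIME_STEPS')
        return int(n)

    assert n_daily_time_steps(60) == 12  # 720 operating minutes -> 12 hourly steps
    # n_daily_time_steps(7) would raise: 720 / 7 is not a whole number.

Failing fast here keeps an invalid `time_step` from silently producing misaligned intervals later.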
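Second, the nested SQL snaps each order to the start of its interval: `DATE_TRUNC('MINUTE', ...)` drops the seconds, and subtracting `minutes % time_step` cuts the remainder. The same logic in plain Python, purely as an illustration; the `snap_to_interval()` helper is hypothetical and not part of the commit:

    import datetime as dt

    def snap_to_interval(placed_at: dt.datetime, time_step: int) -> dt.datetime:
        # Truncate to the minute, like DATE_TRUNC('MINUTE', ...) ...
        without_seconds = placed_at.replace(second=0, microsecond=0)
        # ... then cut the minutes that stick out of the interval.
        minutes_to_be_cut = without_seconds.minute % time_step
        return without_seconds - dt.timedelta(minutes=minutes_to_be_cut)

    # An order placed at 11:47:31 falls into the 11:45 interval with 15-minute steps ...
    assert snap_to_interval(dt.datetime(2020, 8, 1, 11, 47, 31), 15) == dt.datetime(2020, 8, 1, 11, 45)
    # ... and into the 11:00 interval with 60-minute steps.
    assert snap_to_interval(dt.datetime(2020, 8, 1, 11, 47, 31), 60) == dt.datetime(2020, 8, 1, 11, 0)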
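Third, the `reindex` at the end of `.aggregate_orders()` turns the sparse query result into a gap-free time series: any "pixel_id"-"start_at" pair without demand becomes an explicit `0`. A self-contained sketch with made-up values:

    import pandas as pd

    # Sparse result as it might come back from the database: pixel 1 has no
    # demand in the 12:00 interval, so that ("pixel_id", "start_at") pair is missing.
    names = ['pixel_id', 'start_at']
    data = pd.DataFrame(
        {'total_orders': [3, 2]},
        index=pd.MultiIndex.from_tuples(
            [(1, pd.Timestamp('2020-08-01 11:00')), (1, pd.Timestamp('2020-08-01 13:00'))],
            names=names,
        ),
    )

    # The full index enumerates every "pixel_id"-"start_at" combination ...
    index = pd.MultiIndex.from_tuples(
        [
            (1, pd.Timestamp('2020-08-01 11:00')),
            (1, pd.Timestamp('2020-08-01 12:00')),
            (1, pd.Timestamp('2020-08-01 13:00')),
        ],
        names=names,
    )

    # ... and `reindex` fills the gap with a `0`, just like the method does.
    result = data.reindex(index, fill_value=0)
    assert result.loc[(1, pd.Timestamp('2020-08-01 12:00')), 'total_orders'] == 0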