urban-meal-delivery/src/urban_meal_delivery/forecasts/timify.py

"""Obtain and work with time series data."""

import datetime

import pandas as pd

from urban_meal_delivery import config
from urban_meal_delivery import db


def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame:  # pragma: no cover
    """Obtain a time series of the ad-hoc `Order` totals.

    The ad-hoc orders are counted in the database per pixel and per
    `time_step`-minute interval. As intervals without any order are missing
    from the query's result, it is re-indexed afterwards such that every
    "pixel_id"-"start_at" combination within the operating hours
    (i.e., `config.SERVICE_START` to `config.SERVICE_END`) between the first
    and the last day with an order is present; missing totals become `0`.

    Args:
        grid: pixel grid used to aggregate orders spatially
        time_step: interval length (in minutes) into which orders are aggregated

    Returns:
        order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and
            beginnings of the intervals (i.e., "start_at"s); the sole column
            with data is "total_orders"; only pixels returned by the query
            are included; if the query yields no rows, the empty `DataFrame`
            is returned as is
    """
    # `data` is probably missing "pixel_id"-"start_at" pairs.
    # This happens whenever there is no demand in the `Pixel` in the given `time_step`.
    data = pd.read_sql_query(
        f"""-- # noqa:WPS221
        SELECT
            pixel_id,
            start_at,
            COUNT(*) AS total_orders
        FROM (
            SELECT
                pixel_id,
                placed_at_without_seconds - minutes_to_be_cut AS start_at
            FROM (
                SELECT
                    pixels.pixel_id,
                    DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,
                    ((
                        EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step}
                    )::TEXT || ' MINUTES')::INTERVAL
                        AS minutes_to_be_cut
                FROM (
                    SELECT
                        id,
                        placed_at,
                        pickup_address_id
                    FROM
                        {config.CLEAN_SCHEMA}.orders
                    INNER JOIN (
                        SELECT
                            id AS address_id
                        FROM
                            {config.CLEAN_SCHEMA}.addresses
                        WHERE
                            city_id = {grid.city.id}
                    ) AS in_city
                        ON orders.pickup_address_id = in_city.address_id
                    WHERE
                        ad_hoc IS TRUE
                ) AS
                    orders
                INNER JOIN (
                    SELECT
                        address_id,
                        pixel_id
                    FROM
                        {config.CLEAN_SCHEMA}.addresses_pixels
                    WHERE
                        grid_id = {grid.id}
                        AND
                        city_id = {grid.city.id} -- city_id is redundant -> sanity check
                ) AS pixels
                    ON orders.pickup_address_id = pixels.address_id
            ) AS placed_at_aggregated_into_start_at
        ) AS pixel_start_at_combinations
        GROUP BY
            pixel_id,
            start_at
        ORDER BY
            pixel_id,
            start_at;
        """,
        con=db.connection,
        index_col=['pixel_id', 'start_at'],
    )

    # Without any order, there is nothing to derive the time horizon from.
    if data.empty:
        return data

    # Calculate the first and last "start_at" value ...
    first_day = data.index.levels[1].min().date()
    start = datetime.datetime(
        first_day.year, first_day.month, first_day.day, config.SERVICE_START,
    )
    last_day = data.index.levels[1].max().date()
    end = datetime.datetime(
        last_day.year, last_day.month, last_day.day, config.SERVICE_END,
    )

    # ... and all "start_at" values in between that lie within the operating hours.
    # The time grid is the same for every pixel and, thus, built only once.
    # "min" is the minute alias that works across `pandas` versions
    # (the older "T" alias is deprecated in recent releases).
    start_ats = pd.date_range(start, end, freq=f'{time_step}min')
    in_service = (start_ats.hour >= config.SERVICE_START) & (
        start_ats.hour < config.SERVICE_END
    )
    start_ats = start_ats[in_service]

    # Re-index `data` with all possible "pixel_id"-"start_at" combinations
    # filling in `0`s where there is no demand.
    index = pd.MultiIndex.from_product(
        [sorted(data.index.levels[0]), start_ats], names=['pixel_id', 'start_at'],
    )

    return data.reindex(index, fill_value=0)
Add `aggregate_orders()` function: the function queries the database and aggregates the ad-hoc orders by pixel and time step into a demand time series; implement "heavy" integration tests for `aggregate_orders()`; make `pandas` a package dependency; streamline the `Config`. 2021-01-07 23:18:40 +01:00			`"""Obtain and work with time series data."""`

			`import datetime`

			`import pandas as pd`

			`from urban_meal_delivery import config`
			`from urban_meal_delivery import db`


			`def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame: # pragma: no cover`
			"""Obtain a time series of the ad-hoc `Order` totals.

			`Args:`
			`grid: pixel grid used to aggregate orders spatially`
			`time_step: interval length (in minutes) into which orders are aggregated`

			`Returns:`
			order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and
			`beginnings of the intervals (i.e., "start_at"s); the sole column`
			`with data is "total_orders"`
			`"""`
			# `data` is probably missing "pixel_id"-"start_at" pairs.
			# This happens whenever there is no demand in the `Pixel` in the given `time_step`.
			`data = pd.read_sql_query(`
			`f"""-- # noqa:WPS221`
			`SELECT`
			`pixel_id,`
			`start_at,`
			`COUNT(*) AS total_orders`
			`FROM (`
			`SELECT`
			`pixel_id,`
			`placed_at_without_seconds - minutes_to_be_cut AS start_at`
			`FROM (`
			`SELECT`
			`pixels.pixel_id,`
			`DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds,`
			`((`
			`EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step}`
			`)::TEXT \|\| ' MINUTES')::INTERVAL`
			`AS minutes_to_be_cut`
			`FROM (`
			`SELECT`
			`id,`
			`placed_at,`
			`pickup_address_id`
			`FROM`
			`{config.CLEAN_SCHEMA}.orders`
			`INNER JOIN (`
			`SELECT`
			`id AS address_id`
			`FROM`
			`{config.CLEAN_SCHEMA}.addresses`
			`WHERE`
			`city_id = {grid.city.id}`
			`) AS in_city`
			`ON orders.pickup_address_id = in_city.address_id`
			`WHERE`
			`ad_hoc IS TRUE`
			`) AS`
			`orders`
			`INNER JOIN (`
			`SELECT`
			`address_id,`
			`pixel_id`
			`FROM`
			`{config.CLEAN_SCHEMA}.addresses_pixels`
			`WHERE`
			`grid_id = {grid.id}`
			`AND`
			`city_id = {grid.city.id} -- city_id is redundant -> sanity check`
			`) AS pixels`
			`ON orders.pickup_address_id = pixels.address_id`
			`) AS placed_at_aggregated_into_start_at`
			`) AS pixel_start_at_combinations`
			`GROUP BY`
			`pixel_id,`
			`start_at`
			`ORDER BY`
			`pixel_id,`
			`start_at;`
			`""",`
			`con=db.connection,`
			`index_col=['pixel_id', 'start_at'],`
			`)`

			`if data.empty:`
			`return data`

			`# Calculate the first and last "start_at" value ...`
			`start_day = data.index.levels[1].min().date()`
			`start = datetime.datetime(`
			`start_day.year, start_day.month, start_day.day, config.SERVICE_START,`
			`)`
			`end_day = data.index.levels[1].max().date()`
			`end = datetime.datetime(`
			`end_day.year, end_day.month, end_day.day, config.SERVICE_END,`
			`)`

			# ... and all possible `tuple`s of "pixel_id"-"start_at" combinations.
			`# The "start_at" values must lie within the operating hours.`
			`gen = (`
			`(pixel_id, start_at)`
			`for pixel_id in sorted(data.index.levels[0])`
			`for start_at in pd.date_range(start, end, freq=f'{time_step}T')`
			`if config.SERVICE_START <= start_at.time().hour < config.SERVICE_END`
			`)`

			# Re-index `data` filling in `0`s where there is no demand.
			`index = pd.MultiIndex.from_tuples(gen)`
			`index.names = ['pixel_id', 'start_at']`

			`return data.reindex(index, fill_value=0)`