"""Obtain and work with time series data.""" import datetime import pandas as pd from urban_meal_delivery import config from urban_meal_delivery import db def aggregate_orders(grid: db.Grid, time_step: int) -> pd.DataFrame: # pragma: no cover """Obtain a time series of the ad-hoc `Order` totals. Args: grid: pixel grid used to aggregate orders spatially time_step: interval length (in minutes) into which orders are aggregated Returns: order_totals: `DataFrame` with a `MultiIndex` of the "pixel_id"s and beginnings of the intervals (i.e., "start_at"s); the sole column with data is "total_orders" """ # `data` is probably missing "pixel_id"-"start_at" pairs. # This happens whenever there is no demand in the `Pixel` in the given `time_step`. data = pd.read_sql_query( f"""-- # noqa:WPS221 SELECT pixel_id, start_at, COUNT(*) AS total_orders FROM ( SELECT pixel_id, placed_at_without_seconds - minutes_to_be_cut AS start_at FROM ( SELECT pixels.pixel_id, DATE_TRUNC('MINUTE', orders.placed_at) AS placed_at_without_seconds, (( EXTRACT(MINUTES FROM orders.placed_at)::INTEGER % {time_step} )::TEXT || ' MINUTES')::INTERVAL AS minutes_to_be_cut FROM ( SELECT id, placed_at, pickup_address_id FROM {config.CLEAN_SCHEMA}.orders INNER JOIN ( SELECT id AS address_id FROM {config.CLEAN_SCHEMA}.addresses WHERE city_id = {grid.city.id} ) AS in_city ON orders.pickup_address_id = in_city.address_id WHERE ad_hoc IS TRUE ) AS orders INNER JOIN ( SELECT address_id, pixel_id FROM {config.CLEAN_SCHEMA}.addresses_pixels WHERE grid_id = {grid.id} AND city_id = {grid.city.id} -- city_id is redundant -> sanity check ) AS pixels ON orders.pickup_address_id = pixels.address_id ) AS placed_at_aggregated_into_start_at ) AS pixel_start_at_combinations GROUP BY pixel_id, start_at ORDER BY pixel_id, start_at; """, con=db.connection, index_col=['pixel_id', 'start_at'], ) if data.empty: return data # Calculate the first and last "start_at" value ... start_day = data.index.levels[1].min().date() start = datetime.datetime( start_day.year, start_day.month, start_day.day, config.SERVICE_START, ) end_day = data.index.levels[1].max().date() end = datetime.datetime( end_day.year, end_day.month, end_day.day, config.SERVICE_END, ) # ... and all possible `tuple`s of "pixel_id"-"start_at" combinations. # The "start_at" values must lie within the operating hours. gen = ( (pixel_id, start_at) for pixel_id in sorted(data.index.levels[0]) for start_at in pd.date_range(start, end, freq=f'{time_step}T') if config.SERVICE_START <= start_at.time().hour < config.SERVICE_END ) # Re-index `data` filling in `0`s where there is no demand. index = pd.MultiIndex.from_tuples(gen) index.names = ['pixel_id', 'start_at'] return data.reindex(index, fill_value=0)