Merge branch 'model-section' into develop
commit 20abf8eade
26 changed files with 1354 additions and 6 deletions
BIN paper.pdf (binary file not shown)
14 paper.tex
@@ -19,6 +19,16 @@
 \input{tex/2_lit/3_ml/4_rf}
 \input{tex/2_lit/3_ml/5_svm}
 \input{tex/3_mod/1_intro}
+\input{tex/3_mod/2_overall}
+\input{tex/3_mod/3_grid}
+\input{tex/3_mod/4_cv}
+\input{tex/3_mod/5_mase}
+\input{tex/3_mod/6_decomp}
+\input{tex/3_mod/7_models/1_intro}
+\input{tex/3_mod/7_models/2_hori}
+\input{tex/3_mod/7_models/3_vert}
+\input{tex/3_mod/7_models/4_rt}
+\input{tex/3_mod/7_models/5_ml}
 \input{tex/4_stu/1_intro}
 \input{tex/5_con/1_intro}

@@ -29,6 +39,10 @@
 \appendix
 \newpage
+\input{tex/apx/tabular_ml_models}
+\newpage
+\input{tex/apx/enhanced_feats}
+\newpage

 \bibliographystyle{static/elsarticle-harv}
 \bibliography{tex/references}
BIN static/cross_validation_gray.png (new binary file, 4.5 KiB, not shown)
BIN static/gridification_for_paris_gray.png (new binary file, 261 KiB, not shown)
BIN static/model_inputs_gray.png (new binary file, 84 KiB, not shown)
77 static/slashbox.sty (new file)
@@ -0,0 +1,77 @@
+% slashbox.sty by Koichi Yasuoka, May 27, 1993
+% minor modification by Toru Sato, May 31, 1993
+\typeout{slashbox style by K.Yasuoka, May 1993.}%
+\newbox\@slashboxa
+\newbox\@slashboxb
+\newbox\@slashboxc
+\newcount\@slashboxwd
+\newcount\@slashboxht
+\newdimen\@slashsepl
+\newdimen\@slashsepr
+\def\slashbox{%
+  \def\@slashboxpicture##1{%
+    \put(0,0){\line(##1,1){\@slashboxwd}}%
+    \put(0,\@slashboxht){\makebox(0,0)[tl]{\box\@slashboxa}}%
+    \put(\@slashboxwd,0){\makebox(0,0)[br]{\box\@slashboxb}}%
+  }%
+  \@slashbox
+}%
+\def\backslashbox{%
+  \def\@slashboxpicture##1{%
+    \put(0,\@slashboxht){\line(##1,-1){\@slashboxwd}}%
+    \put(0,0){\makebox(0,0)[bl]{\box\@slashboxa}}%
+    \put(\@slashboxwd,\@slashboxht){\makebox(0,0)[tr]{\box\@slashboxb}}%
+  }%
+  \@slashbox
+}%
+\def\@slashbox{\@ifnextchar [{\@@slashbox}{\@@slashbox[0pt]}}
+\def\@@slashbox[#1]{\@ifnextchar [{\@@@slashbox[#1]}{\@@@slashbox[#1][c]}}
+\def\@@@slashbox[#1][#2]#3#4{%
+  % #1: width, #2: suppression of \tabcolsep on `l', `r', or `lr' side
+  % #3: left item, #4: right item
+  \@slashsepl=\tabcolsep
+  \@slashsepr=\tabcolsep
+  \@tfor\@tempa :=#2\do{\expandafter\let
+    \csname @slashsep\@tempa\endcsname=\z@}%
+  \setbox\@slashboxa=\hbox{\strut\hskip\tabcolsep\shortstack[l]{#3}}%
+  \setbox\@slashboxb=\hbox{\shortstack[r]{#4}\hskip\tabcolsep\strut}%
+  \setbox\@slashboxa=\hbox{\raise\dp\@slashboxa\box\@slashboxa}%
+  \setbox\@slashboxb=\hbox{\raise\dp\@slashboxb\box\@slashboxb}%
+  \setbox\@slashboxc=\hbox{%
+    \@tempdima=\wd\@slashboxa
+    \advance\@tempdima by \wd\@slashboxb
+    \advance\@tempdima by \@slashsepl
+    \advance\@tempdima by \@slashsepr
+    \@tempdimb=#1\relax%
+    \ifdim\@tempdimb>\@tempdima \@tempdima=\@tempdimb\fi%
+    \@tempdimb=\ht\@slashboxa
+    \advance\@tempdimb by \dp\@slashboxa
+    \advance\@tempdimb by \ht\@slashboxb
+    \advance\@tempdimb by \dp\@slashboxb
+    \@tempcnta=\@tempdima
+    \@tempcntb=\@tempdimb
+    \advance\@tempcnta by \@tempcntb
+    \advance\@tempcnta by -1
+    \divide\@tempcnta by \@tempcntb
+    \ifnum\@tempcnta>6 \@tempcnta=6
+      \@tempdimb=0.166666666\@tempdima
+    \else
+      \ifnum\@tempcnta<1 \@tempcnta=1\fi
+      \@tempdima=\@tempdimb
+      \multiply\@tempdima by \@tempcnta
+    \fi%
+    \advance\@tempdima by -\@slashsepl
+    \advance\@tempdima by -\@slashsepr
+    \@slashboxwd=\@tempdima
+    \@slashboxht=\@tempdimb
+    \@tempcntb=\@slashsepl
+    \setlength{\unitlength}{1sp}%
+    \begin{picture}(\@slashboxwd,\@slashboxht)(\@tempcntb,0)
+      \advance\@tempdima by \@slashsepl
+      \advance\@tempdima by \@slashsepr
+      \@slashboxwd=\@tempdima
+      \@slashboxpicture{\@tempcnta}
+    \end{picture}%
+  }%
+  $\vcenter{\box\@slashboxc}$%
+}%
BIN static/stl_gray.png (new binary file, 104 KiB, not shown)
tex/3_mod/1_intro.tex
@@ -1,8 +1,6 @@
 \section{Model Formulation}
 \label{mod}

-% temporary placeholders
-\label{decomp}
-\label{f:stl}
-\label{mase}
-\label{unified_cv}
+In this section, we describe how the platform's raw data are pre-processed
+into model inputs and how the forecasting models are built and benchmarked
+against each other.
28 tex/3_mod/2_overall.tex (new file)
@@ -0,0 +1,28 @@
+\subsection{Overall Approach}
+\label{approach_approach}
+
+On a conceptual level, there are three distinct aspects of the model
+development process.
+First, a pre-processing step transforms the platform's tabular order data
+into either time series (Sub-section \ref{grid}) or feature matrices
+(Sub-section \ref{ml_models}).
+Second, a benchmark methodology is developed in Sub-section \ref{unified_cv}
+that compares all models on the same scale, in particular, classical models
+with ML ones.
+Concretely, the CV approach is adapted to the peculiar requirements of
+sub-daily and ordinal time series data so as to maximize the predictive
+power of all models into the future.
+Third, the forecasting models are described with respect to their
+assumptions and training requirements.
+Four classification dimensions are introduced:
+\begin{enumerate}
+\item \textbf{Timeliness of the Information}:
+    whole-day-ahead vs. real-time forecasts
+\item \textbf{Time Series Decomposition}: raw vs. decomposed
+\item \textbf{Algorithm Type}: "classical" statistics vs. ML
+\item \textbf{Data Sources}: pure vs. enhanced (i.e., with external data)
+\end{enumerate}
+Not all of the possible combinations are implemented; instead, the models
+are varied along these dimensions to show different effects and answer the
+research questions.
95 tex/3_mod/3_grid.tex (new file)
@@ -0,0 +1,95 @@
+\subsection{Gridification, Time Tables, and Time Series Generation}
+\label{grid}
+
+The platform's tabular order data are sliced with respect to both location
+and time and then aggregated into time series in which each observation
+gives the number of orders in an area for one time step/interval.
+Figure \ref{f:grid} shows how the orders' delivery locations are each
+matched to a square-shaped cell, referred to as a pixel, on a grid covering
+the entire service area within a city.
+This gridification step is also applied to the pickup locations separately.
+The lower-left corner of the grid is chosen at random.
+\cite{winkenbach2015} apply the same gridification idea and slice an urban
+area to model a location-routing problem, and \cite{singleton2017} portray
+it as a standard method in the field of urban analytics.
+With increasing pixel sizes, the time series exhibit more order aggregation
+and thus a possibly stronger demand pattern.
+On the other hand, the larger the pixels, the less valuable the generated
+forecasts become as, for example, a courier sent to a pixel preemptively
+then faces a longer average distance to a restaurant in the pixel.
+
+\begin{center}
+\captionof{figure}{Gridification for delivery locations in Paris with a
+    pixel size of $1~\text{km}^2$}
+\label{f:grid}
+\includegraphics[width=.8\linewidth]{static/gridification_for_paris_gray.png}
+\end{center}
+
+After gridification, the ad-hoc orders within a pixel are aggregated by
+their placement timestamps into sub-daily time steps of pre-defined lengths
+to obtain a time table as exemplified in Figure \ref{f:timetable} with
+one-hour intervals.
+
+\begin{center}
+\captionof{figure}{Aggregation into a time table with hourly time steps}
+\label{f:timetable}
+\begin{tabular}{|c||*{9}{c|}}
+\hline
+\backslashbox{Time}{Day} & \makebox[2em]{\ldots}
+    & \makebox[3em]{Mon} & \makebox[3em]{Tue}
+    & \makebox[3em]{Wed} & \makebox[3em]{Thu}
+    & \makebox[3em]{Fri} & \makebox[3em]{Sat}
+    & \makebox[3em]{Sun} & \makebox[2em]{\ldots} \\
+\hline
+\hline
+11:00 & \ldots & $y_{11,Mon}$ & $y_{11,Tue}$ & $y_{11,Wed}$ & $y_{11,Thu}$
+    & $y_{11,Fri}$ & $y_{11,Sat}$ & $y_{11,Sun}$ & \ldots \\
+\hline
+12:00 & \ldots & $y_{12,Mon}$ & $y_{12,Tue}$ & $y_{12,Wed}$ & $y_{12,Thu}$
+    & $y_{12,Fri}$ & $y_{12,Sat}$ & $y_{12,Sun}$ & \ldots \\
+\hline
+\ldots & \ldots & \ldots & \ldots & \ldots
+    & \ldots & \ldots & \ldots & \ldots & \ldots \\
+\hline
+20:00 & \ldots & $y_{20,Mon}$ & $y_{20,Tue}$ & $y_{20,Wed}$ & $y_{20,Thu}$
+    & $y_{20,Fri}$ & $y_{20,Sat}$ & $y_{20,Sun}$ & \ldots \\
+\hline
+21:00 & \ldots & $y_{21,Mon}$ & $y_{21,Tue}$ & $y_{21,Wed}$ & $y_{21,Thu}$
+    & $y_{21,Fri}$ & $y_{21,Sat}$ & $y_{21,Sun}$ & \ldots \\
+\hline
+\ldots & \ldots & \ldots & \ldots & \ldots
+    & \ldots & \ldots & \ldots & \ldots & \ldots \\
+\hline
+\end{tabular}
+\end{center}
+\
+
+Consequently, each $y_{t,d}$ in Figure \ref{f:timetable} is the number of
+all orders within the pixel for the time of day $t$ and day of week $d$
+($y_t$ and $y_{t,d}$ are the same but differ in that the latter
+acknowledges a 2D view).
+The same trade-off as with gridification applies:
+The shorter the interval, the weaker the demand pattern to be expected in
+the time series due to less aggregation, while longer intervals lead to
+less usable forecasts.
+We refer to time steps by their start time, and their number per day, $H$,
+is constant.
+Given a time table as in Figure \ref{f:timetable}, there are two ways to
+generate a time series by slicing:
+\begin{enumerate}
+\item \textbf{Horizontal View}:
+    Take only the order counts for a given time of the day
+\item \textbf{Vertical View}:
+    Take all order counts and remove the double-seasonal pattern induced
+    by the weekday and time of the day with decomposition
+\end{enumerate}
+Distinct time series are retrieved by iterating through the time tables
+either horizontally or vertically in increments of a single time step.
+Another property of a generated time series is its length, which, following
+the next sub-section, can be interpreted as the sum of the production
+training set and the test day.
+In summary, a distinct time series is generated from the tabular order data
+based on a configuration of parameters for the dimensions pixel size,
+number of daily time steps $H$, shape (horizontal vs. vertical), length,
+and the time step to be predicted.
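[Editor's note] The gridification and time-table steps are easy to sketch in code. The following Python sketch is illustrative only, not the paper's implementation; the pandas library, the column names x_m, y_m, and placed_at, and the metric coordinates are assumptions (real latitude-longitude pairs would first be projected to meters).

import pandas as pd

def gridify(orders: pd.DataFrame, pixel_size_m: float, x0: float, y0: float) -> pd.DataFrame:
    # Assign each order to a square pixel; (x0, y0) is the randomly chosen
    # lower-left corner of the grid; coordinates are assumed to be in meters.
    orders = orders.copy()
    orders["pixel_x"] = ((orders["x_m"] - x0) // pixel_size_m).astype(int)
    orders["pixel_y"] = ((orders["y_m"] - y0) // pixel_size_m).astype(int)
    return orders

def time_table(orders: pd.DataFrame, freq: str = "60min") -> pd.DataFrame:
    # Count orders per pixel and sub-daily time step; steps without any
    # order are absent here and would be filled with 0 when re-indexing.
    return (
        orders
        .groupby(["pixel_x", "pixel_y", pd.Grouper(key="placed_at", freq=freq)])
        .size()
        .rename("n_orders")
        .reset_index()
    )

Horizontal and vertical slices are then just row-wise and column-wise traversals of the resulting per-pixel table.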
86 tex/3_mod/4_cv.tex (new file)
@@ -0,0 +1,86 @@
+\subsection{Unified Cross-Validation and Training, Validation, and Test Sets}
+\label{unified_cv}
+
+The standard $k$-fold CV, which assumes no structure in the individual
+features of the samples, as shown in $\mat{X}$ above, is adapted to the
+ordinal character of time series data:
+A model must be evaluated on observations that occurred strictly after the
+ones used for training as, otherwise, the model would know about the future.
+Furthermore, some models predict only one or a few time steps before being
+retrained, while others predict an entire day without retraining
+(cf., Sub-section \ref{ml_models}).
+Consequently, we must use a unified time interval wherein all forecasts are
+made first before the entire interval is evaluated.
+As whole days are the longest prediction interval for models without
+retraining, we choose days as the unified time interval.
+In summary, our CV methodology yields a distinct best model per pixel and
+day to be forecast.
+Whole days are also practical for managers who commonly monitor, for
+example, the routing and thus the forecasting performance on a day-to-day
+basis.
+Our methodology assumes that the models are trained at least once per day.
+As we create operational forecasts into the near future in this paper,
+retraining all models with the latest available data is a logical step.
+
+\begin{center}
+\captionof{figure}{Training, validation, and test sets
+    during cross validation}
+\label{f:cv}
+\includegraphics[width=.8\linewidth]{static/cross_validation_gray.png}
+\end{center}
+
+The training, validation, and test sets are defined as follows.
+To exemplify the logic, we refer to Figure \ref{f:cv}, which shows the
+calendar setup (i.e., weekdays on the x-axis) for three days $T_1$, $T_2$,
+and $T_3$ (shown in dark gray) for which we generate forecasts.
+Each of these days is, by definition, a test day, and the test set
+comprises all time series, horizontal or vertical, whose last observation
+lies on that day.
+With an assumed training horizon of three weeks, the 21 days before each of
+the test days constitute the corresponding training sets (shown in lighter
+gray on the same rows as $T_1$, $T_2$, and $T_3$).
+There are two kinds of validation sets, depending on the decision to be
+made.
+First, if a forecasting method needs parameter tuning, the original
+training set is divided into as many equally long series as validation
+days are needed to find stable parameters.
+The example shows three validation days per test day named $V_n$ (shown in
+darker gray below each test day).
+The $21 - 3 = 18$ preceding days constitute the training set corresponding
+to a validation day.
+To obtain the overall validation error, the three errors are averaged.
+We call these \textit{inner} validation sets because they must be repeated
+each day to re-tune the parameters and because the involved time series
+are true subsets of the original series.
+Second, to find the best method per day and pixel, the same averaging
+logic is applied on the outer level.
+For example, if we used two validation days to find the best method for
+$T_3$, we would average the errors of $T_1$ and $T_2$ for each method and
+select the winner; then, $T_1$ and $T_2$ constitute an \textit{outer}
+validation set.
+Whereas the number of inner validation days is method-specific and must be
+chosen before generating any test day forecasts in the first place, the
+number of outer validation days may be varied after the fact and is
+determined empirically as we show in Section \ref{stu}.
+
+Our unified CV approach is also optimized for large-scale production
+settings, for example, at companies like Uber.
+As \cite{bell2018} note, there is a trade-off as to when each of the inner
+time series in the example begins.
+While the forecasting accuracy likely increases with more training days,
+supporting inner series with increasing lengths, cutting the series to the
+same length allows caching the forecasts and errors.
+In the example, $V_3$, $V_5$, and $V_7$, as well as $V_6$ and $V_8$, are
+identical despite belonging to different inner validation sets.
+Caching is also possible on the outer level when searching for an optimal
+number of validation days for model selection.
+We achieved cache hit ratios of up to 80\% in our implementation in the
+empirical study, thereby saving computational resources by the same amount.
+Lastly, we assert that our suggested CV, because it is unified around whole
+test days and uses fixed-size time series, is also suitable for creating
+consistent learning curves and, thus, answering \textbf{Q3} on the
+relationship between forecast accuracy and the amount of historic data:
+We simply increase the length of the outer training set while holding the
+test day fixed.
+Thus, independent of a method's need for parameter tuning, all methods have
+the same demand history available for each test day forecast.
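[Editor's note] The day-splitting logic above can be summarized in a short sketch. This is illustrative only; the 21-day horizon and the three inner validation days mirror the example in Figure \ref{f:cv}, and the function name is hypothetical.

import datetime as dt

def cv_splits(test_day: dt.date, n_train_days: int = 21, n_inner_val_days: int = 3):
    # Inner level: one (training days, validation day) pair per validation
    # day; each validation day is trained on the 21 - 3 = 18 days before it.
    inner = []
    inner_train_days = n_train_days - n_inner_val_days
    for i in range(1, n_inner_val_days + 1):
        val_day = test_day - dt.timedelta(days=i)
        train = [val_day - dt.timedelta(days=d) for d in range(1, inner_train_days + 1)]
        inner.append((train, val_day))
    # Outer level: the full training horizon strictly precedes the test day,
    # so no model ever sees observations from its own evaluation interval.
    outer_train = [test_day - dt.timedelta(days=d) for d in range(1, n_train_days + 1)]
    return inner, (outer_train, test_day)

Because the inner series are cut to the same length, forecasts and errors for overlapping validation days can be cached across test days, which is the source of the cache hit ratios reported above.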
87 tex/3_mod/5_mase.tex (new file)
@@ -0,0 +1,87 @@
+\subsection{Accuracy Measures}
+\label{mase}
+
+Choosing an error measure for both model selection and evaluation is not
+straightforward when working with intermittent demand, as shown, for
+example, by \cite{syntetos2005}, and one should understand the trade-offs
+between measures.
+\cite{hyndman2006} provide a study of measures with real-life data taken
+from the popular M3-competition and find that most standard measures
+degenerate under many scenarios.
+They also provide a classification scheme for which we summarize the main
+points as they apply to the UDP case:
+\begin{enumerate}
+\item \textbf{Scale-dependent Errors}:
+    The error is reported in the same unit as the raw data.
+    Two popular examples are the root mean square error (RMSE) and the
+    mean absolute error (MAE).
+    They may be used for model selection and evaluation within a pixel,
+    and are intuitively interpretable; however, they may not be used to
+    compare the error of, for example, a low-demand pixel (e.g., at the
+    UDP's service boundary) with that of a high-demand pixel (e.g.,
+    downtown).
+\item \textbf{Percentage Errors}:
+    The error is derived from the percentage errors of individual
+    forecasts per time step, and is also intuitively interpretable.
+    A popular example is the mean absolute percentage error (MAPE), which
+    is the primary measure in most forecasting competitions.
+    Whereas such errors could be applied both within and across pixels,
+    they cannot be calculated reliably for intermittent demand:
+    If only one time step exhibits no demand, the result is a
+    divide-by-zero error.
+    This often occurs even in high-demand pixels due to the slicing.
+\item \textbf{Relative Errors}:
+    A workaround is to calculate a scale-dependent error for the test day
+    and divide it by the same measure calculated with forecasts of a
+    simple benchmark method (e.g., the na\"{i}ve method).
+    An example could be
+    $\text{RelMAE} = \text{MAE} / \text{MAE}_\text{bm}$.
+    Nevertheless, even simple methods sometimes create (near-)perfect
+    forecasts, and then $\text{MAE}_\text{bm}$ becomes (close to) $0$.
+    These numerical instabilities occurred so often in our studies that we
+    argue against using such measures.
+\item \textbf{Scaled Errors}:
+    \cite{hyndman2006} contribute this category and introduce the mean
+    absolute scaled error (\gls{mase}).
+    It is defined as the MAE from the actual forecasting method on the
+    test day (i.e., "out-of-sample") divided by the MAE from the
+    (seasonal) na\"{i}ve method on the entire training set (i.e.,
+    "in-sample").
+    A MASE of $1$ indicates that a forecasting method has the same
+    accuracy on the test day as the (seasonal) na\"{i}ve method applied on
+    a longer horizon, and lower values imply higher accuracy.
+    Within a pixel, its results are identical to the ones obtained with
+    the MAE.
+    Also, we acknowledge recent publications, for example,
+    \cite{prestwich2014} or \cite{kim2016}, showing other ways of tackling
+    the difficulties mentioned.
+    However, only the MASE provided numerically stable results for all
+    forecasts in our study.
+\end{enumerate}
+Consequently, we use the MASE with a seasonal na\"{i}ve benchmark as the
+primary measure in this paper.
+With the previously introduced notation, it is defined as follows:
+$$
+\text{MASE}
+:=
+\frac{\text{MAE}_{\text{out-of-sample}}}{\text{MAE}_{\text{in-sample}}}
+=
+\frac{\text{MAE}_{\text{forecasts}}}{\text{MAE}_{\text{training}}}
+=
+\frac{\frac{1}{H} \sum_{h=1}^H |y_{T+h} - \hat{y}_{T+h}|}
+     {\frac{1}{T-k} \sum_{t=k+1}^T |y_{t} - y_{t-k}|}
+$$
+The denominator can only become $0$ if the seasonal na\"{i}ve benchmark
+makes a perfect forecast on each day in the training set except the first
+seven days, which never happened in our case study involving hundreds of
+thousands of individual model trainings.
+Further, as per the discussion in the subsequent Sub-section \ref{decomp},
+we also calculate peak-MASEs where we leave out the time steps of non-peak
+times from the calculations.
+For this analysis, we define all time steps that occur at lunch (i.e.,
+noon to 2 pm) and dinner time (i.e., 6 pm to 8 pm) as peak.
+As time steps in non-peak times typically average no or very low order
+counts, a UDP may choose not to actively forecast these at all and rather
+be interested in the accuracies of forecasting methods during peaks only.
+
+We conjecture that percentage error measures may be usable for UDPs facing
+a higher overall demand with no intra-day down-times in between but have
+to leave that to a future study.
+Yet, even with high and steady demand, divide-by-zero errors are likely to
+occur.
76 tex/3_mod/6_decomp.tex (new file)
@@ -0,0 +1,76 @@
+\subsection{Time Series Decomposition}
+\label{decomp}
+
+Concerning the time table in Figure \ref{f:timetable}, a seasonal demand
+pattern is inherent to both horizontal and vertical time series.
+First, the weekday influences whether people eat out or order in, with our
+partner receiving more orders on Thursday through Saturday than on the
+other four days.
+This pattern is part of both types of time series.
+Second, on any given day, demand peaks occur around lunch and dinner times.
+This pattern only concerns vertical series.
+Statistical analyses show that horizontally sliced time series indeed
+exhibit a periodicity of $k=7$, and vertically sliced series only yield a
+seasonal component with a regular pattern if the periodicity is set to the
+product of the number of weekdays and the daily time steps, indicating a
+distinct intra-day pattern per weekday.
+
+Figure \ref{f:stl} shows three exemplary STL decompositions for a
+$1~\text{km}^2$ pixel and a vertical time series with 60-minute time steps
+(on the x-axis) covering four weeks:
+With the noisy raw data $y_t$ on the left, the seasonal and trend
+components, $s_t$ and $t_t$, are depicted in light and dark gray for
+increasing $ns$ parameters.
+The plots include (seasonal) na\"{i}ve forecasts for the subsequent test
+day as dotted lines.
+The remainder components $r_t$ are not shown for conciseness.
+The periodicity is set to $k = 7 \times 12 = 84$ as our industry partner
+has $12$ opening hours per day.
+
+\begin{center}
+\captionof{figure}{STL decompositions for a medium-demand pixel with hourly
+    time steps and periodicity $k=84$}
+\label{f:stl}
+\includegraphics[width=.95\linewidth]{static/stl_gray.png}
+\end{center}
+
+As described in Sub-section \ref{stl}, with $k$ being implied by the
+application, at the very least, the length of the seasonal smoothing
+window, represented by the $ns$ parameter, must be calibrated by the
+forecaster:
+It controls how many past observations go into each smoothened $s_t$.
+Many practitioners, however, skip this step and set $ns$ to a large
+number, for example, $999$, then referred to as "periodic."
+For the other parameters, it is common to use the default values as
+specified in \cite{cleveland1990}.
+The goal is to find a decomposition with a regular pattern in $s_t$.
+In Figure \ref{f:stl}, this is not true for $ns=7$ where, for example, the
+four largest bars corresponding to the same time of day a week apart
+cannot be connected by an approximately straight line.
+By contrast, a regular pattern exists in the most extreme way for
+$ns=999$, where the same four largest bars are of the same height.
+This observation holds for each time step of the day.
+For $ns=11$, $s_t$ exhibits a regular pattern whose bars adapt over time:
+The pattern is regular as bars corresponding to the same time of day can
+be connected by approximately straight lines, and it is adaptive as these
+lines are not horizontal.
+The trade-off between small and large values for $ns$ can thus be
+interpreted as allowing the average demand during peak times to change
+over time:
+If demand is intermittent at non-peak times, it is reasonable to expect
+the bars to change over time as only the relative differences between peak
+and non-peak times impact the bars' heights with the seasonal component
+being centered around $0$.
+To confirm the goodness of a decomposition statistically, one way is to
+verify that $r_t$ can be modeled as a typical error process like white
+noise $\epsilon_t$.
+
+However, we suggest an alternative way of calibrating the STL method in an
+automated fashion based on our unified CV approach.
+As hinted at in Figure \ref{f:stl}, we interpret an STL decomposition as a
+forecasting method on its own by just adding the (seasonal) na\"{i}ve
+forecasts for $s_t$ and $t_t$ and predicting $0$ for $r_t$.
+Then, the $ns$ parameter is tuned just like a parameter of an ML model.
+To the best of our knowledge, this has not been proposed before.
+Conceptually, forecasting with the STL method can be viewed as a na\"{i}ve
+method with built-in smoothing, and it outperformed all other benchmark
+methods in all cases.
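[Editor's note] A minimal sketch of this idea, with statsmodels' STL as a stand-in implementation (the paper does not name its software): decompose, forecast s_t with the seasonal naive method and t_t with the naive method, and predict 0 for r_t. The $ns$ value would then be selected from a grid (e.g., 7, 11, ..., 999) by its average error on the inner validation days, just like an ML hyperparameter.

import numpy as np
from statsmodels.tsa.seasonal import STL

def stl_naive_forecast(y: np.ndarray, H: int = 12, k: int = 84, ns: int = 11) -> np.ndarray:
    # Decompose with periodicity k, then predict the H steps of the next
    # day: seasonal naive for s_t, naive (last value) for t_t, 0 for r_t.
    res = STL(y, period=k, seasonal=ns).fit()
    s_hat = res.seasonal[-k:][:H]       # repeat the seasonal cycle from one week back
    t_hat = np.full(H, res.trend[-1])   # carry the last trend value forward
    return s_hat + t_hat                # remainder r_t is predicted as 0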
|
20
tex/3_mod/7_models/1_intro.tex
Normal file
20
tex/3_mod/7_models/1_intro.tex
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
\subsection{Forecasting Models}
|
||||||
|
\label{models}
|
||||||
|
|
||||||
|
This sub-section describes the concrete models in our study.
|
||||||
|
Figure \ref{f:inputs} shows how we classify them into four families with
|
||||||
|
regard to the type of the time series, horizontal or vertical, and the
|
||||||
|
moment at which a model is trained:
|
||||||
|
Solid lines indicate that the corresponding time steps lie before the
|
||||||
|
training, and dotted lines show the time horizon predicted by a model.
|
||||||
|
For conciseness, we only show the forecasts for one test day.
|
||||||
|
The setup is the same for each inner validation day.
|
||||||
|
|
||||||
|
\
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{figure}{Classification of the models by input type and training
|
||||||
|
moment}
|
||||||
|
\label{f:inputs}
|
||||||
|
\includegraphics[width=.95\linewidth]{static/model_inputs_gray.png}
|
||||||
|
\end{center}
|
42 tex/3_mod/7_models/2_hori.tex (new file)
@@ -0,0 +1,42 @@
+\subsubsection{Horizontal and Whole-day-ahead Forecasts.}
+\label{hori}
+
+The upper-left in Figure \ref{f:inputs} illustrates the simplest way to
+generate forecasts for a test day before it has started:
+For each time of the day, the corresponding horizontal slice becomes the
+input for a model.
+With whole days being the unified time interval, each model is trained $H$
+times, each providing a one-step-ahead forecast.
+While it is possible to select models of a different type per time step,
+that did not improve the accuracy in the empirical study.
+As the models in this family do not include the test day's demand data in
+their training sets, we see them as benchmarks to answer \textbf{Q4},
+checking whether a UDP can take advantage of real-time information.
+The models in this family are as follows; we use prefixes, such as "h"
+here, when methods are applied in other families as well:
+\begin{enumerate}
+\item \textit{\gls{naive}}:
+    Observation from the same time step one week prior
+\item \textit{\gls{trivial}}:
+    Predict $0$ for all time steps
+\item \textit{\gls{hcroston}}:
+    Intermittent demand method introduced by \cite{croston1972}
+\item \textit{\gls{hholt}},
+    \textit{\gls{hhwinters}},
+    \textit{\gls{hses}},
+    \textit{\gls{hsma}}, and
+    \textit{\gls{htheta}}:
+    Exponential smoothing without calibration
+\item \textit{\gls{hets}}:
+    ETS calibrated as described by \cite{hyndman2008b}
+\item \textit{\gls{harima}}:
+    ARIMA calibrated as described by \cite{hyndman2008a}
+\end{enumerate}
+\textit{naive} and \textit{trivial} provide an absolute benchmark for the
+actual forecasting methods.
+\textit{hcroston} is often mentioned in the context of intermittent
+demand; however, the method did not perform well at all.
+Besides \textit{hhwinters}, which always fits a seasonal component, the
+calibration heuristics behind \textit{hets} and \textit{harima} may do so
+as well.
+With $k=7$, an STL decomposition is unnecessary here.
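[Editor's note] The mechanics of this family fit in a few lines. An illustrative sketch with the seasonal naive method standing in for the per-time-of-day models (one model per row of the time table; the function name is hypothetical):

import numpy as np

def horizontal_day_forecast(time_table: np.ndarray, k: int = 7) -> np.ndarray:
    # time_table: H rows (times of day) x past days (columns); one model
    # per row, here naive: repeat the observation from one week (k=7) back.
    return np.array([float(row[-k]) for row in time_table])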
39 tex/3_mod/7_models/3_vert.tex (new file)
@@ -0,0 +1,39 @@
+\subsubsection{Vertical and Whole-day-ahead Forecasts without Retraining.}
+\label{vert}
+
+The upper-right in Figure \ref{f:inputs} shows an alternative way to
+generate forecasts for a test day before it has started:
+First, a seasonally-adjusted time series $a_t$ is obtained from a vertical
+time series by STL decomposition.
+Then, the actual forecasting model, trained on $a_t$, makes an
+$H$-step-ahead prediction.
+Lastly, we add the $H$ seasonal na\"{i}ve forecasts for the seasonal
+component $s_t$ to obtain the actual predictions for the test day.
+Thus, only one training is required per model type, and no real-time data
+are used.
+By decomposing the raw time series, all long-term patterns are assumed to
+be in the seasonal component $s_t$, and $a_t$ only contains the level with
+a potential trend and auto-correlations.
+The models in this family are:
+\begin{enumerate}
+\item \textit{\gls{fnaive}},
+    \textit{\gls{pnaive}}:
+    Sum of the na\"{i}ve forecasts of STL's trend and seasonal components
+\item \textit{\gls{vholt}},
+    \textit{\gls{vses}}, and
+    \textit{\gls{vtheta}}:
+    Exponential smoothing without calibration and seasonal fit
+\item \textit{\gls{vets}}:
+    ETS calibrated as described by \cite{hyndman2008b}
+\item \textit{\gls{varima}}:
+    ARIMA calibrated as described by \cite{hyndman2008a}
+\end{enumerate}
+As mentioned in Sub-section \ref{unified_cv}, we include the sum of the
+(seasonal) na\"{i}ve forecasts of the STL's trend and seasonal components
+as forecasts on their own:
+For \textit{fnaive}, we tune the "flexible" $ns$ parameter, and for
+\textit{pnaive}, we set it to a "periodic" value.
+Thus, we implicitly assume that there is no signal in the remainder $r_t$
+and predict $0$ for it.
+\textit{fnaive} and \textit{pnaive} are two more simple benchmarks.
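[Editor's note] A sketch of the forecast assembly in this family, with statsmodels' SimpleExpSmoothing standing in for vses; the STL usage and all parameter values are illustrative assumptions, not the study's tuned settings:

import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def vses_forecast(y: np.ndarray, H: int = 12, k: int = 84, ns: int = 11) -> np.ndarray:
    res = STL(y, period=k, seasonal=ns).fit()
    a = y - res.seasonal                  # seasonally-adjusted series a_t
    model = SimpleExpSmoothing(a).fit()   # SES trained once on a_t
    a_hat = model.forecast(H)             # H-step-ahead prediction
    s_hat = res.seasonal[-k:][:H]         # seasonal naive forecast of s_t
    return a_hat + s_hat                  # re-add the seasonal component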
22 tex/3_mod/7_models/4_rt.tex (new file)
@@ -0,0 +1,22 @@
+\subsubsection{Vertical and Real-time Forecasts with Retraining.}
+\label{rt}
+
+The lower-left in Figure \ref{f:inputs} shows how models trained on
+vertical time series are extended with real-time order data as they become
+available during a test day:
+Instead of obtaining an $H$-step-ahead forecast, we retrain a model after
+every time step and only predict one step.
+The remainder is as in the previous sub-section, and the models are:
+\begin{enumerate}
+\item \textit{\gls{rtholt}},
+    \textit{\gls{rtses}}, and
+    \textit{\gls{rttheta}}:
+    Exponential smoothing without calibration and seasonal fit
+\item \textit{\gls{rtets}}:
+    ETS calibrated as described by \cite{hyndman2008b}
+\item \textit{\gls{rtarima}}:
+    ARIMA calibrated as described by \cite{hyndman2008a}
+\end{enumerate}
+Retraining \textit{fnaive} and \textit{pnaive} did not increase accuracy,
+so we left them out.
+A downside of this family is the significant increase in computing costs.
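[Editor's note] The retraining loop can be sketched as follows; illustrative only, with SimpleExpSmoothing standing in for any of the listed methods, and the seasonal component re-added afterwards exactly as in the previous family:

import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def realtime_day_forecast(a_history: np.ndarray, a_test_day: np.ndarray) -> np.ndarray:
    # a_history: seasonally-adjusted series up to the test day;
    # a_test_day: the H observations arriving during the test day.
    preds, series = [], a_history.copy()
    for actual in a_test_day:
        fit = SimpleExpSmoothing(series).fit()   # retrain on all data so far
        preds.append(float(fit.forecast(1)[0]))  # predict one step only
        series = np.append(series, actual)       # the real-time observation arrives
    return np.array(preds)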
54 tex/3_mod/7_models/5_ml.tex (new file)
@@ -0,0 +1,54 @@
+\subsubsection{Vertical and Real-time Forecasts without Retraining.}
+\label{ml_models}
+
+The lower-right in Figure \ref{f:inputs} shows how ML models take
+real-time order data into account without retraining.
+Based on the seasonally-adjusted time series $a_t$, we employ the feature
+matrix and label vector representations from Sub-section \ref{learning}
+and set $n$ to the number of daily time steps, $H$, to cover all potential
+auto-correlations.
+The ML models are trained once before a test day starts.
+For training, the matrix and vector are populated such that $y_T$ is set
+to the last time step of the day before the forecasts, $a_T$.
+As the splitting during CV is done with whole days, the \gls{ml} models
+are trained with training sets consisting of samples from all times of a
+day in an equal manner.
+Thus, the ML models learn to predict each time of the day.
+For prediction on a test day, the $H$ observations preceding the time step
+to be forecast are used as the input vector after seasonal adjustment.
+As a result, real-time data are included.
+The models in this family are:
+\begin{enumerate}
+\item \textit{\gls{vrfr}}: RF trained on the matrix as described
+\item \textit{\gls{vsvr}}: SVR trained on the matrix as described
+\end{enumerate}
+We tried other ML models, such as gradient boosting machines, but found
+only RFs and SVRs to perform well in our study.
+In the case of gradient boosting machines, this is to be expected as they
+are known not to perform well in the presence of high noise, as is natural
+with low count data, as shown, for example, by \cite{ma2018} or
+\cite{mason2000}.
+Also, deep learning methods are not applicable as the feature matrices
+only consist of several hundred to thousands of rows (cf., Sub-section
+\ref{params}).
+In \ref{tabular_ml_models}, we provide an alternative feature matrix
+representation that exploits the two-dimensional structure of time tables
+without decomposing the time series.
+In \ref{enhanced_feats}, we show how feature matrices are extended to
+include predictors other than historical order data.
+However, to answer \textbf{Q5} already here, none of the external data
+sources improves the results in our study.
+Due to the high number of time series in our study, investigating why no
+external source improves the forecasts requires an automated approach to
+analyzing individual time series.
+\cite{barbour2014} provide a spectral density estimation approach, called
+the Shannon entropy, that measures the signal-to-noise ratio in a database
+as a number normalized between 0 and 1, where lower values indicate a
+higher signal-to-noise ratio.
+We then look at daily averages of the estimates per pixel and find that
+including any of the external data sources from \ref{enhanced_feats}
+always leads to significantly lower signal-to-noise ratios.
+Thus, we conclude that, at least for the demand faced by our industry
+partner, the historical data contain all of the signal.
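[Editor's note] A sketch of the lag-matrix construction and the train-once, predict-in-real-time workflow, with scikit-learn estimators standing in for vrfr and vsvr; the toy series and hyperparameters are illustrative, not the tuned values from the study:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

def lag_matrix(a: np.ndarray, n: int):
    # Each row maps the n previous observations of a_t to the next one.
    X = np.array([a[i - n:i] for i in range(n, len(a))])
    return X, a[n:]

a = np.random.default_rng(42).poisson(3.0, size=28 * 12).astype(float)  # toy a_t, 4 weeks
X, y = lag_matrix(a, n=12)                 # n = H = 12 daily time steps
rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)   # vrfr analog
svr = SVR(kernel="rbf", C=1.0).fit(X, y)                                 # vsvr analog
# Real-time prediction: the last H (seasonally adjusted) observations form
# the input vector for the next time step -- no retraining needed.
next_step = rf.predict(a[-12:].reshape(1, -1))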
tex/4_stu/1_intro.tex
@@ -1,2 +1,5 @@
 \section{Empirical Study: A Meal Delivery Platform in Europe}
 \label{stu}
+
+% temporary placeholder
+\label{params}
54 tex/apx/case_study.tex (new file)
@@ -0,0 +1,54 @@
+\section{Raw Order Data in the Case Study}
+\label{dataset}
+
+The raw data for the empirical study in Section \ref{stu} were provided by
+a meal delivery platform operating in five cities in France in 2016.
+The platform received a total of 686,385 orders distributed as follows:
+
+\
+\begin{center}
+\begin{tabular}{llr}
+\hline
+\thead{City} & \thead{Launch Day} & \thead{Orders} \\
+\hline
+Bordeaux & July 18 & 64,012 \\
+Lille & October 30 & 14,362 \\
+Lyon & February 21 & 214,635 \\
+Nantes & October 31 & 12,900 \\
+Paris & March 7 & 380,476 \\
+\end{tabular}
+\end{center}
+\
+
+The part of the database relevant for forecasting can be thought of as one
+table per city, where each row represents one order and consists of the
+following groups of columns:
+\begin{enumerate}
+\item \textbf{Restaurant Data}
+    \begin{enumerate}
+    \item unique ID and name
+    \item pickup location as latitude-longitude pair
+    \end{enumerate}
+\item \textbf{Customer Data}
+    \begin{enumerate}
+    \item unique ID, name, and phone number
+    \item delivery location as latitude-longitude pair (mostly physical
+        addresses but also public spots)
+    \end{enumerate}
+\item \textbf{Timestamps}
+    \begin{enumerate}
+    \item placement via the smartphone app
+    \item fulfillment workflow (pickup, delivery, cancellation,
+        re-deliveries)
+    \end{enumerate}
+\item \textbf{Courier Data}
+    \begin{enumerate}
+    \item unique ID, name, and phone number
+    \item shift data (begin, breaks, end)
+    \item average speed
+    \end{enumerate}
+\item \textbf{Order Details}
+    \begin{enumerate}
+    \item meals and drinks
+    \item prices and discounts granted
+    \end{enumerate}
+\end{enumerate}
121 tex/apx/enhanced_feats.tex (new file)
@@ -0,0 +1,121 @@
+\section{Enhancing Forecasting Models with External Data}
+\label{enhanced_feats}
+
+In this appendix, we show how the feature matrix in Sub-section
+\ref{ml_models} can be extended with features other than historical order
+data.
+Then, we provide an overview of the external data we tried out as
+predictors in our empirical study.
+
+\subsection{Enhanced Feature Matrices}
+
+Feature matrices can naturally be extended by appending new feature
+columns $x_{t,f}$ or $x_f$ on the right, where the former represent
+predictors changing throughout a day while the latter are static either
+within a pixel or across a city.
+$f$ refers to an external predictor variable, such as one of the examples
+listed below.
+In the SVR case, the columns should be standardized before fitting as
+external predictors are most likely on a different scale than the historic
+order data.
+Thus, for a matrix with seasonally-adjusted order data $a_t$ in it, an
+enhanced matrix looks as follows:
+
+$$
+\vec{y}
+=
+\begin{pmatrix}
+a_T \\
+a_{T-1} \\
+\dots \\
+a_{H+1}
+\end{pmatrix}
+~~~~~
+\mat{X}
+=
+\begin{bmatrix}
+a_{T-1} & a_{T-2} & \dots & a_{T-H} & ~~~
+    & x_{T,A} & \dots & x_{B} & \dots \\
+a_{T-2} & a_{T-3} & \dots & a_{T-(H+1)} & ~~~
+    & x_{T-1,A} & \dots & x_{B} & \dots \\
+\dots & \dots & \dots & \dots & ~~~
+    & \dots & \dots & \dots & \dots \\
+a_H & a_{H-1} & \dots & a_1 & ~~~
+    & x_{H+1,A} & \dots & x_{B} & \dots
+\end{bmatrix}
+$$
+\
+
+Similarly, we can also enhance the tabular matrices from
+\ref{tabular_ml_models}.
+The same comments as for their pure equivalents in Sub-section
+\ref{ml_models} apply, in particular, that ML models trained with an
+enhanced matrix can process real-time data without being retrained.
+
+\subsection{External Data in the Empirical Study}
+\label{external_data}
+
+In the empirical study, we tested four groups of external features that we
+briefly describe here.
+
+\vskip 0.1in
+
+\textbf{Calendar Features}:
+\begin{itemize}
+\item Time of day (as synthesized integers: e.g., 1,050 for 10:30 am, or
+    1,600 for 4 pm)
+\item Day of week (as one-hot encoded booleans)
+\item Work day or not (as booleans)
+\end{itemize}
+
+\vskip 0.1in
+
+\textbf{Features Derived from the Historical Order Data}:
+\begin{itemize}
+\item Number of pre-orders for a time step (as integers)
+\item 7-day SMA of the percentages of discounted orders (as percentages):
+    The platform is known for running marketing campaigns aimed at
+    first-time customers at irregular intervals. Consequently, the order
+    data show a wave-like pattern of coupons redeemed when looking at the
+    relative share of discounted orders per day.
+\end{itemize}
+
+\vskip 0.1in
+
+\textbf{Neighborhood Features}:
+\begin{itemize}
+\item Ambient population (as integers) as obtained from the ORNL LandScan
+    database
+\item Number of active platform restaurants (as integers)
+\item Number of overall restaurants, food outlets, retailers, and other
+    businesses (as integers) as obtained from the Google Maps and Yelp
+    web services
+\end{itemize}
+
+\vskip 0.1in
+
+\textbf{Real-time Weather} (raw data obtained from IBM's Wunderground
+database):
+\begin{itemize}
+\item Absolute temperature, wind speed, and humidity (as decimals and
+    percentages)
+\item Relative temperature with respect to 3-day and 7-day historical
+    means (as decimals)
+\item Day vs. night defined by sunset (as booleans)
+\item Summarized description (as indicators $-1$, $0$, and $+1$)
+\item Lags of the absolute temperature and the summaries covering the
+    previous three hours
+\end{itemize}
+
+\vskip 0.1in
+
+Unfortunately, we must report that none of the mentioned external data
+improved the accuracy of the forecasts.
+Some led to models overfitting the data, which could not be regulated.
+Manual tests revealed that real-time weather data are the most promising
+external source.
+Nevertheless, the data provided by IBM's Wunderground database originate
+from weather stations close to airports, which implies that we only have
+the same aggregate weather data for the entire city.
+If weather data become available on a more granular basis in the future,
+we see some potential for exploitation.
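[Editor's note] A sketch of an enhanced matrix and the standardization advised above for the SVR; the external columns and all values are synthetic placeholders for the feature groups listed in this appendix:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

rng = np.random.default_rng(0)
n_rows, H = 300, 12
X_lags = rng.poisson(3.0, size=(n_rows, H)).astype(float)  # a_t lag columns
x_weather = rng.normal(15.0, 5.0, size=(n_rows, 1))        # x_{t,f}: changes per time step
x_population = np.full((n_rows, 1), 2750.0)                # x_f: static within the pixel
X = np.hstack([X_lags, x_weather, x_population])           # append external columns on the right
y = rng.poisson(3.0, size=n_rows).astype(float)

# Standardization puts lags and external predictors on one scale for the SVR.
svr = make_pipeline(StandardScaler(), SVR(kernel="rbf", C=1.0)).fit(X, y)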
261 tex/apx/peak_results.tex (new file)
@@ -0,0 +1,261 @@
|
||||||
|
\section{Forecasting Accuracies during Peak Times}
|
||||||
|
\label{peak_results}
|
||||||
|
|
||||||
|
This appendix shows all result tables from the main text with the MASE
|
||||||
|
averages calculated from time steps within peak times.
|
||||||
|
Peaks are the times of the day where the typical customer has a lunch or
|
||||||
|
dinner meal and defined to be either from 12 pm to 2 pm or from 6 pm to
|
||||||
|
8 pm.
|
||||||
|
While the exact decimals of the MASEs differ from the ones in the main
|
||||||
|
text, the relative ranks of the forecasting methods are the same except in
|
||||||
|
rare cases.
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Top-3 models by training weeks and average demand
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps)}
|
||||||
|
\label{t:results:a}
|
||||||
|
\begin{tabular}{|c|c|*{12}{c|}}
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{\rotatebox{90}{\thead{Training}}}
|
||||||
|
& \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{No Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Low Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Medium Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{High Demand}} \\
|
||||||
|
~ & ~
|
||||||
|
& \multicolumn{3}{c|}{(0 - 2.5)}
|
||||||
|
& \multicolumn{3}{c|}{(2.5 - 10)}
|
||||||
|
& \multicolumn{3}{c|}{(10 - 25)}
|
||||||
|
& \multicolumn{3}{c|}{(25 - $\infty$)} \\
|
||||||
|
\cline{3-14}
|
||||||
|
~ & ~
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$ \\
|
||||||
|
|
||||||
|
\hline \hline
|
||||||
|
\multirow{3}{*}{3} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.794 & \multirow{3}{*}{\rotatebox{90}{4586}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.817 & \multirow{3}{*}{\rotatebox{90}{2975}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.838 & \multirow{3}{*}{\rotatebox{90}{2743}}
|
||||||
|
& \textbf{\textit{rtarima}}
|
||||||
|
& 0.871 & \multirow{3}{*}{\rotatebox{90}{2018}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.808 & ~
|
||||||
|
& \textit{hses} & 0.847 & ~
|
||||||
|
& \textit{hses} & 0.851 & ~
|
||||||
|
& \textit{rtses} & 0.872 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.938 & ~
|
||||||
|
& \textit{hets} & 0.848 & ~
|
||||||
|
& \textit{hets} & 0.853 & ~
|
||||||
|
& \textit{rtets} & 0.874 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{4} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.791 & \multirow{3}{*}{\rotatebox{90}{4532}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.833 & \multirow{3}{*}{\rotatebox{90}{3033}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2687}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.848 & \multirow{3}{*}{\rotatebox{90}{2016}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.794 & ~
|
||||||
|
& \textit{hses} & 0.838 & ~
|
||||||
|
& \textit{hses} & 0.847 & ~
|
||||||
|
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.907 & ~
|
||||||
|
& \textit{hets} & 0.841 & ~
|
||||||
|
& \textit{hets} & 0.851 & ~
|
||||||
|
& \textit{rtses} & 0.857 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{5} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.782 & \multirow{3}{*}{\rotatebox{90}{4527}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.844 & \multirow{3}{*}{\rotatebox{90}{3055}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2662}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.849 & \multirow{3}{*}{\rotatebox{90}{2019}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.802 & ~
|
||||||
|
& \textit{hses} & 0.851 & ~
|
||||||
|
& \textit{hets} & 0.844 & ~
|
||||||
|
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.888 & ~
|
||||||
|
& \textit{hets} & 0.863 & ~
|
||||||
|
& \textit{hses} & 0.845 & ~
|
||||||
|
& \textit{vsvr} & 0.853 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{6} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.743 & \multirow{3}{*}{\rotatebox{90}{4470}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{3086}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2625}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.844 & \multirow{3}{*}{\rotatebox{90}{2025}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.765 & ~
|
||||||
|
& \textit{hses} & 0.853 & ~
|
||||||
|
& \textit{hses} & 0.844 & ~
|
||||||
|
& \textbf{\textit{hets}} & 0.847 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.836 & ~
|
||||||
|
& \textit{hets} & 0.861 & ~
|
||||||
|
& \textit{hets} & 0.844 & ~
|
||||||
|
& \textit{vsvr} & 0.849 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{7} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.728 & \multirow{3}{*}{\rotatebox{90}{4454}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.855 & \multirow{3}{*}{\rotatebox{90}{3132}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2597}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2007}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.744 & ~
|
||||||
|
& \textit{hses} & 0.862 & ~
|
||||||
|
& \textit{hsma} & 0.845 & ~
|
||||||
|
& \textbf{\textit{vrfr}} & 0.842 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.812 & ~
|
||||||
|
& \textit{hets} & 0.868 & ~
|
||||||
|
& \textbf{\textit{vsvr}} & 0.849 & ~
|
||||||
|
& \textit{vsvr} & 0.846 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{8} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.736 & \multirow{3}{*}{\rotatebox{90}{4402}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.865 & \multirow{3}{*}{\rotatebox{90}{3159}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2575}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.759 & ~
|
||||||
|
& \textit{hets} & 0.874 & ~
|
||||||
|
& \textbf{\textit{vsvr}} & 0.848 & ~
|
||||||
|
& \textbf{\textit{vrfr}} & 0.841 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.820 & ~
|
||||||
|
& \textit{hses} & 0.879 & ~
|
||||||
|
& \textit{hsma} & 0.850 & ~
|
||||||
|
& \textit{vsvr} & 0.847 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Ranking of benchmark and horizontal models
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps):
|
||||||
|
the table shows the ranks for cases with $2.5 < ADD < 25$
|
||||||
|
(and $25 < ADD < \infty$ in parentheses if they differ)}
|
||||||
|
\label{t:hori:a}
|
||||||
|
\begin{tabular}{|c|ccc|cccccccc|}
|
||||||
|
\hline
|
||||||
|
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Benchmarks}}
|
||||||
|
& \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
|
||||||
|
\cline{2-12}
|
||||||
|
~ & \textit{naive} & \textit{fnaive} & \textit{paive}
|
||||||
|
& \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
|
||||||
|
& \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
|
||||||
|
\hline \hline
|
||||||
|
3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
|
||||||
|
& 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
|
||||||
|
4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
|
||||||
|
& 9 (10) & 10 (9) & 2 (8) & 1 (4) & 6 (7) \\
|
||||||
|
5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
|
||||||
|
& 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
|
||||||
|
6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
\
|
||||||
|
|
||||||
|
\begin{center}
\captionof{table}{Ranking of classical models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:vert:a}
\begin{tabular}{|c|cc|ccccc|ccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{2}{c|}{\thead{Benchmarks}}
& \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
& \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
\cline{2-13}
~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
& \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
& \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
\hline \hline
3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
4 & 2 (7) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (8) & 4 (2) & 11 (12) \\
5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
& 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
& 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
& 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
8 & 2 (1) & 1 (9) & 8 (5) & 7 & 10 (8)
& 6 & 12 (10) & 5 (2) & 4 & 9 (6) & 3 & 11 \\
\hline
\end{tabular}
\end{center}

\

\pagebreak

\begin{center}
\captionof{table}{Ranking of ML models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:ml:a}
\begin{tabular}{|c|cccc|cc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{4}{c|}{\thead{Benchmarks}}
& \multicolumn{2}{c|}{\thead{ML}} \\
\cline{2-7}
~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
& \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
\hline \hline
3 & 6 & 2 (5) & 1 (3) & 3 (1) & 5 (2) & 4 \\
4 & 6 (5) & 2 (3) & 1 (6) & 3 (2) & 5 (1) & 4 \\
5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
\hline
\end{tabular}
\end{center}

\

58
tex/apx/tabular_ml_models.tex
Normal file

@ -0,0 +1,58 @@
\section{Tabular and Real-time Forecasts without Retraining}
\label{tabular_ml_models}

As an alternative to the feature matrix for the ML models in Sub-section
\ref{ml_models}, we describe an approach that works without the STL
method.
Instead of decomposing a time series and arranging the resulting
seasonally-adjusted time series $a_t$ into a matrix $\mat{X}$, one can
create a matrix with two types of feature columns mapped to the raw
observations in $\vec{y}$:
the first group of columns takes all observations at the same time of day
over a horizon of, for example, one week ($n_h = 7$), while the second
group takes all observations within a pre-defined time window, for
example, $3$ hours ($n_r = 3$ for 60-minute time steps), preceding the
time step to be fitted.
Thus, we exploit the two-dimensional structure of time tables as well and
conceptually model historical and recent demand.
The alternative feature matrix looks as follows, where the left block
holds the historical and the right block the recent demand features:

$$
\vec{y}
=
\begin{pmatrix}
y_T \\
y_{T-1} \\
\vdots \\
y_{1+n_hH}
\end{pmatrix}
~~~~~
\mat{X}
=
\begin{bmatrix}
y_{T-H} & y_{T-2H} & \dots & y_{T-n_hH}
& y_{T-1} & y_{T-2} & \dots & y_{T-n_r} \\
y_{T-1-H} & y_{T-1-2H} & \dots & y_{T-1-n_hH}
& y_{T-2} & y_{T-3} & \dots & y_{T-n_r-1} \\
\vdots & \vdots & \ddots & \vdots
& \vdots & \vdots & \ddots & \vdots \\
y_{1+(n_h-1)H} & y_{1+(n_h-2)H} & \dots & y_1
& y^*_{1+n_hH-1} & y^*_{1+n_hH-2} & \dots & y^*_{1+n_hH-n_r}
\end{bmatrix}
$$

\

As a minor detail, we note that recent demand features that would reach
back into the previous day are set to $0$, as indicated by the $^*$
notation above.
This alignment of the undecomposed order data $y_t$ ensures that the ML
models learn the two seasonal patterns independently.
The parameters $n_h$ and $n_r$ must be adapted to the data, but we found
the above values to work well.

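To make the construction concrete, the following minimal Python sketch
builds $\mat{X}$ and $\vec{y}$ from a raw demand series; it is an
illustration under our own naming and defaults
(\texttt{tabular\_features}, $H = 24$ steps per day), not the code used
in the study:

\begin{verbatim}
import numpy as np

def tabular_features(y, H=24, n_h=7, n_r=3):
    # y: 1-d array of demand counts, oldest first (y[0] corresponds to y_1)
    # H: time steps per day (24 for 60-minute steps); n_h, n_r as in the text
    T = len(y)
    rows, targets = [], []
    for t in range(n_h * H, T):  # first target corresponds to y_{1+n_h*H}
        hist = [y[t - k * H] for k in range(1, n_h + 1)]
        # zero out recent features that would fall on the previous day
        recent = [y[t - k] if (t - k) // H == t // H else 0
                  for k in range(1, n_r + 1)]
        rows.append(hist + recent)
        targets.append(y[t])
    # rows are ordered oldest target first; the matrix above lists them
    # newest first, which is irrelevant for model fitting
    return np.array(rows), np.array(targets)
\end{verbatim}

The returned pair can be passed directly to any standard regressor, for
example scikit-learn's \texttt{RandomForestRegressor().fit(X, y)}.
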
As such matrices resemble time tables, we refer to them as tabular.
However, we found the ML models trained on vertical time series to
outperform the tabular ML models, which is why we disregarded the latter
in the study.
This tabular form could still be beneficial for UDPs whose demand exhibits
a weaker seasonality, such as meal delivery platforms.
102
tex/glossary.tex

@ -5,6 +5,9 @@
\newglossaryentry{cv}{
name=CV, description={Cross Validation}
}
\newglossaryentry{mase}{
name=MASE, description={Mean Absolute Scaled Error}
}
\newglossaryentry{ml}{
name=ML, description={Machine Learning}
}

@ -27,4 +30,103 @@
name=VRP, description={Vehicle Routing Problem}
}

% Model names.
\newglossaryentry{naive}{
name=naive, description={(Seasonal) Na\"{i}ve Method}
}
\newglossaryentry{fnaive}{
name=fnaive, description={"Flexible" STL Decomposition,
with tuned ns parameter}
}
\newglossaryentry{pnaive}{
name=pnaive, description={"Periodic" STL Decomposition,
with ns parameter set to a large number}
}
\newglossaryentry{trivial}{
name=trivial, description={Trivial Method}
}
\newglossaryentry{hcroston}{
name=hcroston, description={Croston's Method,
trained on horizontal time series}
}
\newglossaryentry{hholt}{
name=hholt, description={Holt's Linear Trend Method,
trained on horizontal time series}
}
\newglossaryentry{vholt}{
name=vholt, description={Holt's Linear Trend Method,
trained on vertical time series}
}
\newglossaryentry{rtholt}{
name=rtholt, description={Holt's Linear Trend Method,
(re)trained on vertical time series}
}
\newglossaryentry{hhwinters}{
name=hhwinters, description={Holt-Winters' Seasonal Method,
trained on horizontal time series}
}
\newglossaryentry{hses}{
name=hses, description={Simple Exponential Smoothing Method,
trained on horizontal time series}
}
\newglossaryentry{vses}{
name=vses, description={Simple Exponential Smoothing Method,
trained on vertical time series}
}
\newglossaryentry{rtses}{
name=rtses, description={Simple Exponential Smoothing Method,
(re)trained on vertical time series}
}
\newglossaryentry{hsma}{
name=hsma, description={Simple Moving Average Method,
trained on horizontal time series}
}
\newglossaryentry{htheta}{
name=htheta, description={Theta Method,
trained on horizontal time series}
}
\newglossaryentry{vtheta}{
name=vtheta, description={Theta Method,
trained on vertical time series}
}
\newglossaryentry{rttheta}{
name=rttheta, description={Theta Method,
(re)trained on vertical time series}
}
\newglossaryentry{hets}{
name=hets, description={ETS State Space Method,
trained on horizontal time series}
}
\newglossaryentry{vets}{
name=vets, description={ETS State Space Method,
trained on vertical time series}
}
\newglossaryentry{rtets}{
name=rtets, description={ETS State Space Method,
(re)trained on vertical time series}
}
\newglossaryentry{harima}{
name=harima, description={Autoregressive Integrated Moving Average Method,
trained on horizontal time series}
}
\newglossaryentry{varima}{
name=varima, description={Autoregressive Integrated Moving Average Method,
trained on vertical time series}
}
\newglossaryentry{rtarima}{
name=rtarima, description={Autoregressive Integrated Moving Average Method,
(re)trained on vertical time series}
}
\newglossaryentry{vrfr}{
name=vrfr, description={Random Forest Regression Method,
trained on vertical time series}
}
\newglossaryentry{vsvr}{
name=vsvr, description={Support Vector Regression Method,
trained on vertical time series}
}

\printglossaries
@ -4,6 +4,12 @@
\usepackage[acronym]{glossaries}
\makeglossaries

% Enable captions for figures and tables.
\usepackage{caption}

% Enable diagonal lines in tables.
\usepackage{static/slashbox}

% Make opening quotes look different than closing quotes.
\usepackage[english=american]{csquotes}
\MakeOuterQuote{"}

@ -26,6 +26,16 @@ volume={1},
pages={461--466}
}

@article{barbour2014,
title={psd: Adaptive, sine multitaper power spectral density estimation for R},
author={Barbour, Andrew J and Parker, Robert L},
year={2014},
journal={Computers \& Geosciences},
volume={63},
pages={1--8},
publisher={Elsevier}
}

@misc{bell2018,
title = {Forecasting at Uber: An Introduction},
author={Bell, Franziska and Smyl, Slawek},
@ -139,6 +149,16 @@ number={1},
pages={3--73}
}

@article{croston1972,
title={Forecasting and Stock Control for Intermittent Demands},
author={Croston, J D},
year={1972},
journal={Journal of the Operational Research Society},
volume={23},
number={3},
pages={289--303}
}

@book{dagum2016,
title={Seasonal Adjustment Methods and Real Time Trend-Cycle Estimation},
author={Dagum, Estela and Bianconcini, Silvia},
@ -275,6 +295,17 @@ number={2},
pages={287--290}
}

@article{hyndman2006,
title={Another Look at Measures of Forecast Accuracy},
author={Hyndman, Rob and Koehler, Anne},
year={2006},
journal={International Journal of Forecasting},
volume={22},
number={4},
pages={679--688},
publisher={Elsevier}
}

@article{hyndman2008a,
title={Automatic Time Series Forecasting: The forecast package for R},
author={Hyndman, Rob and Khandakar, Yeasmin},
@ -298,6 +329,18 @@ year={2018},
publisher={OTexts}
}

@article{kim2016,
title={A New Metric of Absolute Percentage Error for Intermittent Demand
Forecasts},
author={Kim, Sungil and Kim, Heeyoung},
year={2016},
journal={International Journal of Forecasting},
volume={32},
number={3},
pages={669--679},
publisher={Elsevier}
}

@article{kwiatkowski1992,
title={Testing the null hypothesis of stationarity against the alternative of a
unit root: How sure are we that economic time series have a unit root?},
@ -319,6 +362,17 @@ howpublished = {\url{https://eng.uber.com/neural-networks/}},
note = {Accessed: 2020-10-01}
}

@article{ma2018,
title={Using the Gradient Boosting Decision Tree to Improve the Delineation of
Hourly Rain Areas during the Summer from Advanced Himawari Imager Data},
author={Ma, Liang and Zhang, Guoping and Lu, Er},
year={2018},
journal={Journal of Hydrometeorology},
volume={19},
number={5},
pages={761--776}
}

@article{masmoudi2018,
title={The dial-a-ride problem with electric vehicles and battery
swapping stations},
@ -330,6 +384,15 @@ volume={118},
pages={392--420}
}

@inproceedings{mason2000,
title={Boosting algorithms as gradient descent},
author={Mason, Llew and Baxter, Jonathan and Bartlett, Peter L
and Frean, Marcus R},
year={2000},
booktitle={Advances in neural information processing systems},
pages={512--518}
}

@inproceedings{mueller1997,
title={Predicting Time Series with Support Vector Machines},
author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar
@ -367,6 +430,18 @@ number={5},
pages={311--315}
}

@article{prestwich2014,
title={Mean-based Error Measures for Intermittent Demand Forecasting},
author={Prestwich, Steven and Rossi, Roberto and Tarim, Armagan
and Hnich, Brahim},
year={2014},
journal={International Journal of Production Research},
volume={52},
number={22},
pages={6782--6791},
publisher={Taylor \& Francis}
}

@incollection{scholkopf1998,
title={Fast Approximation of Support Vector Kernel Expansions, and an
Interpretation of Clustering as Approximation in Feature Spaces},
@ -378,6 +453,13 @@ publisher={Springer},
pages={125--132}
}

@book{singleton2017,
title={Urban Analytics},
author={Singleton, Alex David and Spielman, Seth and Folch, David},
year={2017},
publisher={Sage}
}

@article{smola2004,
title={A Tutorial on Support Vector Regression},
author={Smola, Alex and Sch{\"o}lkopf, Bernhard},
@ -398,6 +480,17 @@ pages={285--292},
publisher={MIT, Cambridge, MA, USA}
}

@article{syntetos2005,
title={The Accuracy of Intermittent Demand Estimates},
author={Syntetos, Aris and Boylan, John},
year={2005},
journal={International Journal of Forecasting},
volume={21},
number={2},
pages={303--314},
publisher={Elsevier}
}

@article{taylor2003,
title={Exponential Smoothing with a Damped Multiplicative Trend},
author={Taylor, James},
@ -442,6 +535,18 @@ volume={118},
pages={496--512}
}

@article{winkenbach2015,
title={Enabling urban logistics services at La Poste through
multi-echelon location-routing},
author={Winkenbach, Matthias and Kleindorfer, Paul R and Spinler, Stefan},
year={2015},
journal={Transportation Science},
volume={50},
number={2},
pages={520--540},
publisher={INFORMS}
}

@article{winters1960,
title={Forecasting Sales by Exponentially Weighted Moving Averages},
author={Winters, Peter},