
Merge branch 'model-section' into develop

This commit is contained in:
Alexander Hess 2020-10-04 23:42:50 +02:00
commit 20abf8eade
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
26 changed files with 1354 additions and 6 deletions

BIN
paper.pdf

Binary file not shown.

View file

@ -19,6 +19,16 @@
\input{tex/2_lit/3_ml/4_rf}
\input{tex/2_lit/3_ml/5_svm}
\input{tex/3_mod/1_intro}
\input{tex/3_mod/2_overall}
\input{tex/3_mod/3_grid}
\input{tex/3_mod/4_cv}
\input{tex/3_mod/5_mase}
\input{tex/3_mod/6_decomp}
\input{tex/3_mod/7_models/1_intro}
\input{tex/3_mod/7_models/2_hori}
\input{tex/3_mod/7_models/3_vert}
\input{tex/3_mod/7_models/4_rt}
\input{tex/3_mod/7_models/5_ml}
\input{tex/4_stu/1_intro}
\input{tex/5_con/1_intro}
@ -29,6 +39,10 @@
\appendix
\newpage
\input{tex/apx/tabular_ml_models}
\newpage
\input{tex/apx/enhanced_feats}
\newpage
\bibliographystyle{static/elsarticle-harv}
\bibliography{tex/references}

Binary file not shown. (new image, 4.5 KiB)

Binary file not shown. (new image, 261 KiB)

Binary file not shown. (new image, 84 KiB)

77
static/slashbox.sty Normal file
View file

@ -0,0 +1,77 @@
% slashbox.sty by Koichi Yasuoka, May 27, 1993
% minor modification by Toru Sato, May 31, 1993
\typeout{slashbox style by K.Yasuoka, May 1993.}%
\newbox\@slashboxa
\newbox\@slashboxb
\newbox\@slashboxc
\newcount\@slashboxwd
\newcount\@slashboxht
\newdimen\@slashsepl
\newdimen\@slashsepr
\def\slashbox{%
\def\@slashboxpicture##1{%
\put(0,0){\line(##1,1){\@slashboxwd}}%
\put(0,\@slashboxht){\makebox(0,0)[tl]{\box\@slashboxa}}%
\put(\@slashboxwd,0){\makebox(0,0)[br]{\box\@slashboxb}}%
}%
\@slashbox
}%
\def\backslashbox{%
\def\@slashboxpicture##1{%
\put(0,\@slashboxht){\line(##1,-1){\@slashboxwd}}%
\put(0,0){\makebox(0,0)[bl]{\box\@slashboxa}}%
\put(\@slashboxwd,\@slashboxht){\makebox(0,0)[tr]{\box\@slashboxb}}%
}%
\@slashbox
}%
\def\@slashbox{\@ifnextchar [{\@@slashbox}{\@@slashbox[0pt]}}
\def\@@slashbox[#1]{\@ifnextchar [{\@@@slashbox[#1]}{\@@@slashbox[#1][c]}}
\def\@@@slashbox[#1][#2]#3#4{%
% #1: width, #2: suppression of \tabcolsep on `l', `r', or `lr' side
% #3: left item, #4: right item
\@slashsepl=\tabcolsep
\@slashsepr=\tabcolsep
\@tfor\@tempa :=#2\do{\expandafter\let
\csname @slashsep\@tempa\endcsname=\z@}%
\setbox\@slashboxa=\hbox{\strut\hskip\tabcolsep\shortstack[l]{#3}}%
\setbox\@slashboxb=\hbox{\shortstack[r]{#4}\hskip\tabcolsep\strut}%
\setbox\@slashboxa=\hbox{\raise\dp\@slashboxa\box\@slashboxa}%
\setbox\@slashboxb=\hbox{\raise\dp\@slashboxb\box\@slashboxb}%
\setbox\@slashboxc=\hbox{%
\@tempdima=\wd\@slashboxa
\advance\@tempdima by \wd\@slashboxb
\advance\@tempdima by \@slashsepl
\advance\@tempdima by \@slashsepr
\@tempdimb=#1\relax%
\ifdim\@tempdimb>\@tempdima \@tempdima=\@tempdimb\fi%
\@tempdimb=\ht\@slashboxa
\advance\@tempdimb by \dp\@slashboxa
\advance\@tempdimb by \ht\@slashboxb
\advance\@tempdimb by \dp\@slashboxb
\@tempcnta=\@tempdima
\@tempcntb=\@tempdimb
\advance\@tempcnta by \@tempcntb
\advance\@tempcnta by -1
\divide\@tempcnta by \@tempcntb
\ifnum\@tempcnta>6 \@tempcnta=6
\@tempdimb=0.166666666\@tempdima
\else
\ifnum\@tempcnta<1 \@tempcnta=1\fi
\@tempdima=\@tempdimb
\multiply\@tempdima by \@tempcnta
\fi%
\advance\@tempdima by -\@slashsepl
\advance\@tempdima by -\@slashsepr
\@slashboxwd=\@tempdima
\@slashboxht=\@tempdimb
\@tempcntb=\@slashsepl
\setlength{\unitlength}{1sp}%
\begin{picture}(\@slashboxwd,\@slashboxht)(\@tempcntb,0)
\advance\@tempdima by \@slashsepl
\advance\@tempdima by \@slashsepr
\@slashboxwd=\@tempdima
\@slashboxpicture{\@tempcnta}
\end{picture}%
}%
$\vcenter{\box\@slashboxc}$%
}%

BIN
static/stl_gray.png Normal file

Binary file not shown. (new image, 104 KiB)

View file

@ -1,8 +1,6 @@
\section{Model Formulation}
\label{mod}
% temporary placeholders
\label{decomp}
\label{f:stl}
\label{mase}
\label{unified_cv}
In this section, we describe how the platform's raw data are pre-processed
into model inputs and how the forecasting models are built and benchmarked
against each other.

28
tex/3_mod/2_overall.tex Normal file
View file

@ -0,0 +1,28 @@
\subsection{Overall Approach}
\label{approach_approach}
On a conceptual level, there are three distinct aspects of the model
development process.
First, a pre-processing step transforms the platform's tabular order data into
either time series in Sub-section \ref{grid} or feature matrices in
Sub-section \ref{ml_models}.
Second, a benchmark methodology is developed in Sub-section \ref{unified_cv}
    that compares all models, in particular classical models with ML ones,
    on the same scale.
Concretely, the CV approach is adapted to the peculiar requirements of
    sub-daily and ordinal time series data so that each model's predictive
    power into the future is maximized.
Third, the forecasting models are described with respect to their assumptions
and training requirements.
Four classification dimensions are introduced:
\begin{enumerate}
\item \textbf{Timeliness of the Information}:
whole-day-ahead vs. real-time forecasts
\item \textbf{Time Series Decomposition}: raw vs. decomposed
\item \textbf{Algorithm Type}: "classical" statistics vs. ML
\item \textbf{Data Sources}: pure vs. enhanced (i.e., with external data)
\end{enumerate}
Not all of the sixteen possible combinations are implemented; instead, the
    models are varied along these dimensions to isolate different effects and
    answer the research questions.

95
tex/3_mod/3_grid.tex Normal file
View file

@ -0,0 +1,95 @@
\subsection{Gridification, Time Tables, and Time Series Generation}
\label{grid}
The platform's tabular order data are sliced with respect to both location and
    time and then aggregated into time series where each observation is the
    number of orders in an area during one time step (i.e., interval).
Figure \ref{f:grid} shows how the orders' delivery locations are each
    matched to a square-shaped cell, referred to as a pixel, on a grid
    covering the entire service area within a city.
This gridification step is also applied separately to the pickup locations.
The grid's lower-left corner is chosen at random.
\cite{winkenbach2015} apply the same gridification idea and slice an urban
area to model a location-routing problem, and \cite{singleton2017} portray
it as a standard method in the field of urban analytics.
With increasing pixel sizes, the time series aggregate more orders and thus
    likely exhibit a stronger demand pattern.
On the other hand, the larger the pixels, the less valuable the generated
    forecasts become as, for example, a courier sent into a pixel
    preemptively then faces a longer average distance to any restaurant
    within it.
\begin{center}
\captionof{figure}{Gridification for delivery locations in Paris with a pixel
size of $1~\text{km}^2$}
\label{f:grid}
\includegraphics[width=.8\linewidth]{static/gridification_for_paris_gray.png}
\end{center}
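To make the gridification step concrete, the following minimal sketch (in
    Python; not the platform's actual implementation) assigns delivery
    locations to pixel indices, assuming the coordinates are already
    projected to meters and shifting the grid origin at random:
\begin{verbatim}
import numpy as np

def gridify(x, y, pixel_side_m=1000.0, rng=None):
    # Map projected coordinates (in meters) to (column, row) pixel indices.
    # The grid origin (lower-left corner) is shifted by a random offset,
    # mirroring the random choice of the origin described above.
    rng = np.random.default_rng() if rng is None else rng
    x0 = x.min() - rng.uniform(0, pixel_side_m)
    y0 = y.min() - rng.uniform(0, pixel_side_m)
    cols = np.floor((x - x0) / pixel_side_m).astype(int)
    rows = np.floor((y - y0) / pixel_side_m).astype(int)
    return cols, rows

# Example: three delivery locations, 1 km^2 pixels.
x = np.array([1200.0, 1800.0, 5400.0])
y = np.array([300.0, 950.0, 4100.0])
print(gridify(x, y, pixel_side_m=1000.0))
\end{verbatim}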
After gridification, the ad-hoc orders within a pixel are aggregated by their
placement timestamps into sub-daily time steps of pre-defined lengths
to obtain a time table as exemplified in Figure \ref{f:timetable} with
one-hour intervals.
\begin{center}
\captionof{figure}{Aggregation into a time table with hourly time steps}
\label{f:timetable}
\begin{tabular}{|c||*{9}{c|}}
\hline
\backslashbox{Time}{Day} & \makebox[2em]{\ldots}
& \makebox[3em]{Mon} & \makebox[3em]{Tue}
& \makebox[3em]{Wed} & \makebox[3em]{Thu}
& \makebox[3em]{Fri} & \makebox[3em]{Sat}
& \makebox[3em]{Sun} & \makebox[2em]{\ldots} \\
\hline
\hline
11:00 & \ldots & $y_{11,Mon}$ & $y_{11,Tue}$ & $y_{11,Wed}$ & $y_{11,Thu}$
& $y_{11,Fri}$ & $y_{11,Sat}$ & $y_{11,Sun}$ & \ldots \\
\hline
12:00 & \ldots & $y_{12,Mon}$ & $y_{12,Tue}$ & $y_{12,Wed}$ & $y_{12,Thu}$
& $y_{12,Fri}$ & $y_{12,Sat}$ & $y_{12,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
& \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
20:00 & \ldots & $y_{20,Mon}$ & $y_{20,Tue}$ & $y_{20,Wed}$ & $y_{20,Thu}$
& $y_{20,Fri}$ & $y_{20,Sat}$ & $y_{20,Sun}$ & \ldots \\
\hline
21:00 & \ldots & $y_{21,Mon}$ & $y_{21,Tue}$ & $y_{21,Wed}$ & $y_{21,Thu}$
& $y_{21,Fri}$ & $y_{21,Sat}$ & $y_{21,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
& \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
\end{tabular}
\end{center}
\
Consequently, each $y_{t,d}$ in Figure \ref{f:timetable} is the number of
    all orders within the pixel for the time of day $t$ and day of week
    $d$ ($y_t$ and $y_{t,d}$ denote the same count; the latter merely makes
    the two-dimensional view explicit).
The same trade-off as with gridification applies:
The shorter the interval, the weaker the demand pattern to be expected in
    the time series due to less aggregation, while longer intervals lead to
    less usable forecasts.
We refer to time steps by their start time, and their number per day, $H$,
is constant.
Given a time table as in Figure \ref{f:timetable}, there are two ways to
    generate a time series by slicing:
\begin{enumerate}
\item \textbf{Horizontal View}:
Take only the order counts for a given time of the day
\item \textbf{Vertical View}:
Take all order counts and remove the double-seasonal pattern induced
by the weekday and time of the day with decomposition
\end{enumerate}
Distinct time series are retrieved by iterating through the time tables either
horizontally or vertically in increments of a single time step.
Another property of a generated time series is its length, which, following
    the next sub-section, can be interpreted as the length of the production
    training set plus the test day.
In summary, a distinct time series is generated from the tabular order data
based on a configuration of parameters for the dimensions pixel size,
number of daily time steps $H$, shape (horizontal vs. vertical), length,
and the time step to be predicted.
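As an illustration of the slicing, the following sketch (in Python with
    pandas; the data and names are made up) aggregates the raw order
    timestamps of a single pixel into an hourly time table and extracts a
    horizontal and a vertical series from it:
\begin{verbatim}
import pandas as pd

# One row per ad-hoc order within a single pixel (illustrative timestamps).
orders = pd.DataFrame({"placed_at": pd.to_datetime([
    "2016-07-18 11:05", "2016-07-18 11:40", "2016-07-18 12:10",
    "2016-07-19 11:20", "2016-07-19 12:30", "2016-07-19 12:45",
])})

# Aggregate into hourly time steps and pivot into a time table
# (rows: time of day, columns: calendar day), as in the figure above.
counts = (orders.set_index("placed_at")
                .resample("1H").size()
                .rename("n_orders").reset_index())
counts["day"] = counts["placed_at"].dt.date
counts["time_of_day"] = counts["placed_at"].dt.hour
timetable = counts.pivot_table(index="time_of_day", columns="day",
                               values="n_orders", fill_value=0)

horizontal_11h = timetable.loc[11]  # horizontal view: one time of day, all days
vertical = timetable.T.stack()      # vertical view: day by day, step by step
print(horizontal_11h, vertical, sep="\n")
\end{verbatim}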

86
tex/3_mod/4_cv.tex Normal file
View file

@ -0,0 +1,86 @@
\subsection{Unified Cross-Validation and Training, Validation, and Test Sets}
\label{unified_cv}
The standard $k$-fold CV, which assumes that the samples (i.e., the rows of
    $\mat{X}$ above) carry no temporal structure, is adapted to the ordinal
    character of time series data:
A model must be evaluated on observations that occurred strictly after the
    ones used for training as, otherwise, the model knows about the future.
Furthermore, some models predict only one or a few time steps before being
    retrained, while others predict an entire day without retraining
    (cf., Sub-section \ref{ml_models}).
Consequently, we must use a unified time interval wherein all forecasts are
made first before the entire interval is evaluated.
As whole days are the longest prediction interval for models without
retraining, we choose that as the unified time interval.
In summary, our CV methodology yields a distinct best model per pixel and day
to be forecast.
Whole days are also practical for managers who commonly monitor, for example,
the routing and thus the forecasting performance on a day-to-day basis.
Our methodology assumes that the models are trained at least once per day.
As we create operational forecasts into the near future in this paper,
retraining all models with the latest available data is a logical step.
\begin{center}
\captionof{figure}{Training, validation, and test sets
during cross validation}
\label{f:cv}
\includegraphics[width=.8\linewidth]{static/cross_validation_gray.png}
\end{center}
The training, validation, and test sets are defined as follows.
To exemplify the logic, we refer to Figure \ref{f:cv} that shows the calendar
setup (i.e., weekdays on the x-axis) for three days $T_1$, $T_2$, and
$T_3$ (shown in dark gray) for which we generate forecasts.
Each of these days is, by definition, a test day, and the test set comprises
all time series, horizontal or vertical, whose last observation lies on
that day.
With an assumed training horizon of three weeks, the 21 days before each of
the test days constitute the corresponding training sets (shown in lighter
gray on the same rows as $T_1$, $T_2$, and $T_3$).
There are two kinds of validation sets, depending on the decision to be made.
First, if a forecasting method needs parameter tuning, the original training
    set is divided into as many equally long series as there are validation
    days needed to find stable parameters.
The example shows three validation days per test day named $V_n$ (shown
in darker gray below each test day).
The $21 - 3 = 18$ preceding days constitute the training set corresponding to
a validation day.
To obtain the overall validation error, the three errors are averaged.
We call these \textit{inner} validation sets because they must be repeated
each day to re-tune the parameters and because the involved time series
are true subsets of the original series.
Second, to find the best method per day and pixel, the same averaging logic
is applied on the outer level.
For example, if we used two validation days to find the best method for $T_3$,
we would average the errors of $T_1$ and $T_2$ for each method and select
the winner; then, $T_1$ and $T_2$ constitute an \textit{outer} validation
set.
Whereas the number of inner validation days is method-specific and must be
chosen before generating any test day forecasts in the first place, the
number of outer validation days may be varied after the fact and is
determined empirically as we show in Section \ref{stu}.
Our unified CV approach is also optimized for large-scale production settings,
for example, at companies like Uber.
As \cite{bell2018} note, there is a trade-off as to when each of the
inner time series in the example begins.
While the forecasting accuracy likely increases with more training days,
    which favors inner series of increasing length, cutting the series to the
    same length allows caching the forecasts and errors.
In the example, $V_3$, $V_5$, and $V_7$, as well as $V_6$ and $V_8$ are
identical despite belonging to different inner validation sets.
Caching is also possible on the outer level when searching for an optimal
number of validation days for model selection.
We achieved up to 80\% cache hit ratios in our implementation in the
empirical study, thereby saving computational resources by the same
amount.
Lastly, we assert that our suggested CV, because it is unified around whole
    test days and uses fixed-size time series, is also suitable for creating
    consistent learning curves and, thus, answering
    \textbf{Q3} on the relationship between forecast accuracy and the amount
    of historical data:
We simply increase the length of the outer training set while holding the
    test day fixed.
Thus, independent of a method's need for parameter tuning, all methods have
the same demand history available for each test day forecast.
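A minimal sketch of the resulting splits (in Python; the day counts follow
    the example above and are otherwise assumptions) could look as follows:
\begin{verbatim}
import datetime as dt

def outer_split(test_day, n_train_days=21):
    # All n_train_days calendar days strictly before the test day
    # form the outer training set.
    train = [test_day - dt.timedelta(days=d) for d in range(n_train_days, 0, -1)]
    return train, test_day

def inner_splits(training_days, n_val_days=3, n_inner_train_days=18):
    # Each of the last n_val_days training days becomes an inner validation
    # day, trained on the n_inner_train_days days immediately preceding it.
    # The fixed length is what makes forecasts and errors cacheable.
    for i in range(1, n_val_days + 1):
        pos = len(training_days) - i
        yield training_days[pos - n_inner_train_days:pos], training_days[pos]

train_days, test_day = outer_split(dt.date(2016, 7, 18))
for inner_train, val_day in inner_splits(train_days):
    print(val_day, inner_train[0], "...", inner_train[-1])
\end{verbatim}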

87
tex/3_mod/5_mase.tex Normal file
View file

@ -0,0 +1,87 @@
\subsection{Accuracy Measures}
\label{mase}
Choosing an error measure for both model selection and evaluation is not
straightforward when working with intermittent demand, as shown, for
example, by \cite{syntetos2005}, and one should understand the trade-offs
between measures.
\cite{hyndman2006} provide a study of measures with real-life data taken from
the popular M3-competition and find that most standard measures degenerate
under many scenarios.
They also provide a classification scheme for which we summarize the main
points as they apply to the UDP case:
\begin{enumerate}
\item \textbf{Scale-dependent Errors}:
The error is reported in the same unit as the raw data.
Two popular examples are the root mean square error (RMSE) and mean absolute
error (MAE).
They may be used for model selection and evaluation within a pixel, and are
  intuitively interpretable; however, they may not be used to compare errors
  of, for example, a low-demand pixel (e.g., at the UDP's service
  boundary) with those of a high-demand pixel (e.g., downtown).
\item \textbf{Percentage Errors}:
The error is derived from the percentage errors of individual forecasts per
time step, and is also intuitively interpretable.
A popular example is the mean absolute percentage error (MAPE), which is the
  primary measure in most forecasting competitions.
Whereas such errors could be applied both within and across pixels, they
cannot be calculated reliably for intermittent demand.
If only one time step exhibits no demand, the result is a divide-by-zero
error.
This often occurs even in high-demand pixels due to the slicing.
\item \textbf{Relative Errors}:
A workaround is to calculate a scale-dependent error for the test day and
divide it by the same measure calculated with forecasts of a simple
benchmark method (e.g., na\"{i}ve method).
An example could be
$\text{RelMAE} = \text{MAE} / \text{MAE}_\text{bm}$.
Nevertheless, even simple methods at times create (near-)perfect forecasts, in
  which case $\text{MAE}_\text{bm}$ becomes (close to) $0$.
These numerical instabilities occurred so often in our studies that we argue
against using such measures.
\item \textbf{Scaled Errors}:
\cite{hyndman2006} contribute this category and introduce the mean absolute
scaled error (\gls{mase}).
It is defined as the MAE from the actual forecasting method on the test day
(i.e., "out-of-sample") divided by the MAE from the (seasonal) na\"{i}ve
method on the entire training set (i.e., "in-sample").
A MASE of $1$ indicates that a forecasting method has the same accuracy
on the test day as the (seasonal) na\"{i}ve method applied on a longer
horizon, and lower values imply higher accuracy.
Within a pixel, its model rankings are identical to the ones obtained with the MAE.
Also, we acknowledge recent publications, for example, \cite{prestwich2014} or
\cite{kim2016}, showing other ways of tackling the difficulties mentioned.
However, only the MASE provided numerically stable results for all
forecasts in our study.
\end{enumerate}
Consequently, we use the MASE with a seasonal na\"{i}ve benchmark as the
primary measure in this paper.
With the previously introduced notation, it is defined as follows:
$$
\text{MASE}
:=
\frac{\text{MAE}_{\text{out-of-sample}}}{\text{MAE}_{\text{in-sample}}}
=
\frac{\text{MAE}_{\text{forecasts}}}{\text{MAE}_{\text{training}}}
=
\frac{\frac{1}{H} \sum_{h=1}^H |y_{T+h} - \hat{y}_{T+h}|}
{\frac{1}{T-k} \sum_{t=k+1}^T |y_{t} - y_{t-k}|}
$$
The denominator can only become $0$ if the seasonal na\"{i}ve benchmark makes
a perfect forecast on each day in the training set except the first seven
days, which never happened in our case study involving hundreds of
thousands of individual model trainings.
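A direct translation of this definition into code (a sketch in Python; the
    arrays are illustrative) is:
\begin{verbatim}
import numpy as np

def mase(y_test, y_hat, y_train, k=7):
    # Out-of-sample MAE of the forecasts on the test day divided by the
    # in-sample MAE of the (seasonal) naive method on the training set.
    mae_out = np.mean(np.abs(np.asarray(y_test) - np.asarray(y_hat)))
    y_train = np.asarray(y_train, dtype=float)
    mae_in = np.mean(np.abs(y_train[k:] - y_train[:-k]))
    return mae_out / mae_in

# Example: a horizontal series (periodicity k=7) with a one-step test day.
print(mase([5.0], [4.0], [3, 4, 6, 8, 9, 5, 4, 3, 5, 7, 8, 9, 6, 4], k=7))
\end{verbatim}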
Further, as per the discussion in the subsequent Sub-section \ref{decomp}, we
    also calculate peak-MASEs where we leave out the time steps of non-peak
    times from the calculations.
For this analysis, we define all time steps that occur at lunch (i.e., noon to
2 pm) and dinner time (i.e., 6 pm to 8 pm) as peak.
As time steps in non-peak times typically average no or very low order counts,
    a UDP may choose not to forecast these at all and rather be interested
    in the accuracies of forecasting methods during peaks only.
We conjecture that percentage error measures may be usable for UDPs facing a
higher overall demand with no intra-day down-times in between but have to
leave that to a future study.
Yet, even with high and steady demand, divide-by-zero errors are likely to
occur.

76
tex/3_mod/6_decomp.tex Normal file
View file

@ -0,0 +1,76 @@
\subsection{Time Series Decomposition}
\label{decomp}
Concerning the time table in Figure \ref{f:timetable}, a seasonal demand
pattern is inherent to both horizontal and vertical time series.
First, the weekday influences whether people eat out or order in, with our
    partner receiving more orders on Thursday through Saturday than on the
    other four days.
This pattern is part of both types of time series.
Second, on any given day, demand peaks occur around lunch and dinner times.
This pattern only concerns vertical series.
Statistical analyses show that horizontally sliced time series indeed exhibit
    a periodicity of $k=7$, whereas vertically sliced series only yield a
    seasonal component with a regular pattern if the periodicity is set to
    the product of the number of weekdays and the number of daily time
    steps, indicating a distinct intra-day pattern per weekday.
Figure \ref{f:stl} shows three exemplary STL decompositions for a
$1~\text{km}^2$ pixel and a vertical time series with 60-minute time steps
(on the x-axis) covering four weeks:
With the noisy raw data $y_t$ on the left, the seasonal and trend components,
$s_t$ and $t_t$, are depicted in light and dark gray for increasing $ns$
parameters.
The plots include (seasonal) na\"{i}ve forecasts for the subsequent test day
as dotted lines.
The remainder components $r_t$ are not shown for conciseness.
The periodicity is set to $k = 7 \times 12 = 84$ as our industry partner has
    $12$ opening hours per day.
\begin{center}
\captionof{figure}{STL decompositions for a medium-demand pixel with hourly
time steps and periodicity $k=84$}
\label{f:stl}
\includegraphics[width=.95\linewidth]{static/stl_gray.png}
\end{center}
As described in Sub-section \ref{stl}, with $k$ being implied by the
application, at the very least, the length of the seasonal smoothing
window, represented by the $ns$ parameter, must be calibrated by the
forecaster:
It controls how many past observations go into each smoothed $s_t$.
Many practitioners, however, skip this step and set $ns$ to a large number,
    for example, $999$, which is then referred to as "periodic."
For the other parameters, it is common to use the default values as
specified in \cite{cleveland1990}.
The goal is to find a decomposition with a regular pattern in $s_t$.
In Figure \ref{f:stl}, this is not true for $ns=7$ where, for
example, the four largest bars corresponding to the same time of day a
week apart cannot be connected by an approximately straight line.
At the other extreme, the pattern is perfectly regular for $ns=999$, where
    the same four largest bars are all of the same height.
This observation holds for each time step of the day.
For $ns=11$, $s_t$ exhibits a regular pattern whose bars adapt over time:
The pattern is regular as bars corresponding to the same time of day can be
connected by approximately straight lines, and it is adaptive as these
lines are not horizontal.
The trade-off between small and large values for $ns$ can thus be interpreted
    as allowing the average demand during peak times to change over time:
If demand is intermittent at non-peak times, it is reasonable to expect the
    bars to change over time because, with the seasonal component centered
    around $0$, only the relative differences between peak and non-peak
    times impact the bars' heights.
To confirm the goodness of a decomposition statistically, one way is to verify
that $r_t$ can be modeled as a typical error process like white noise
$\epsilon_t$.
However, we suggest an alternative way of calibrating the STL method in an
automated fashion based on our unified CV approach.
As hinted at in Figure \ref{f:stl}, we interpret an STL decomposition as a
forecasting method on its own by just adding the (seasonal) na\"{i}ve
forecasts for $s_t$ and $t_t$ and predicting $0$ for $r_t$.
Then, the $ns$ parameter is tuned just like a parameter for an ML model.
To the best of our knowledge, this has not been proposed before.
Conceptually, forecasting with the STL method can be viewed as a na\"{i}ve
method with built-in smoothing, and it outperformed all other
benchmark methods in all cases.
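A sketch of this idea (in Python, using statsmodels' STL implementation; the
    series is simulated and the parameter values are only examples) is shown
    below; the $ns$ candidate with the lowest validation error would be kept:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL

def stl_naive_forecast(y, period=84, ns=11, horizon=12):
    # Treat the STL decomposition itself as a forecasting method:
    # seasonal naive forecast of s_t plus a naive forecast of t_t,
    # and 0 for the remainder r_t.
    res = STL(y, period=period, seasonal=ns).fit()
    seasonal_fc = res.seasonal[-period:][:horizon]  # repeat last seasonal cycle
    trend_fc = np.full(horizon, res.trend[-1])      # carry last trend value forward
    return seasonal_fc + trend_fc

rng = np.random.default_rng(0)
y = rng.poisson(3, size=4 * 7 * 12).astype(float)   # four weeks, 12 steps per day
for ns in (7, 11, 999):                              # candidates to be validated
    print(ns, stl_naive_forecast(y, period=84, ns=ns, horizon=12)[:3])
\end{verbatim}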

View file

@ -0,0 +1,20 @@
\subsection{Forecasting Models}
\label{models}
This sub-section describes the concrete models in our study.
Figure \ref{f:inputs} shows how we classify them into four families with
regard to the type of the time series, horizontal or vertical, and the
moment at which a model is trained:
Solid lines indicate that the corresponding time steps lie before the
training, and dotted lines show the time horizon predicted by a model.
For conciseness, we only show the forecasts for one test day.
The setup is the same for each inner validation day.
\
\begin{center}
\captionof{figure}{Classification of the models by input type and training
moment}
\label{f:inputs}
\includegraphics[width=.95\linewidth]{static/model_inputs_gray.png}
\end{center}

View file

@ -0,0 +1,42 @@
\subsubsection{Horizontal and Whole-day-ahead Forecasts.}
\label{hori}
The upper-left in Figure \ref{f:inputs} illustrates the simplest way to
generate forecasts for a test day before it has started:
For each time of the day, the corresponding horizontal slice becomes the input
for a model.
With whole days being the unified time interval, each model is trained $H$
    times, each time providing a one-step-ahead forecast.
While it is possible to select a different model type per time step, doing
    so did not improve the accuracy in the empirical study.
As the models in this family do not include the test day's demand data in
their training sets, we see them as benchmarks to answer \textbf{Q4},
checking if a UDP can take advantage of real-time information.
The models in this family are as follows; we use prefixes, such as "h" here,
when methods are applied in other families as well:
\begin{enumerate}
\item \textit{\gls{naive}}:
Observation from the same time step one week prior
\item \textit{\gls{trivial}}:
Predict $0$ for all time steps
\item \textit{\gls{hcroston}}:
Intermittent demand method introduced by \cite{croston1972}
\item \textit{\gls{hholt}},
\textit{\gls{hhwinters}},
\textit{\gls{hses}},
\textit{\gls{hsma}}, and
\textit{\gls{htheta}}:
Exponential smoothing without calibration
\item \textit{\gls{hets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{harima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
\textit{naive} and \textit{trivial} provide an absolute benchmark for the
actual forecasting methods.
\textit{hcroston} is often mentioned in the context of intermittent demand;
however, the method did not perform well at all.
Besides \textit{hhwinters}, which always fits a seasonal component, the
    calibration heuristics behind \textit{hets} and \textit{harima} may fit
    one as well.
With $k=7$, an STL decomposition is unnecessary here.
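The following sketch (in Python, using statsmodels; simulated data)
    illustrates the family's mechanics for \textit{hses}: one model per time
    of day, each trained on the corresponding horizontal slice and asked for
    a one-step-ahead forecast:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def horizontal_day_ahead(timetable):
    # One model per time of day; each provides a one-step-ahead forecast,
    # so the whole test day is covered before it starts.
    forecasts = {}
    for time_of_day, series in timetable.items():
        fit = SimpleExpSmoothing(np.asarray(series, dtype=float)).fit()
        forecasts[time_of_day] = float(fit.forecast(1)[0])
    return forecasts

# Example: three weeks of daily counts for the 11:00 and 12:00 time steps.
rng = np.random.default_rng(1)
timetable = {11: rng.poisson(2, size=21), 12: rng.poisson(5, size=21)}
print(horizontal_day_ahead(timetable))
\end{verbatim}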

View file

@ -0,0 +1,39 @@
\subsubsection{Vertical and Whole-day-ahead Forecasts without Retraining.}
\label{vert}
The upper-right in Figure \ref{f:inputs} shows an alternative way to
generate forecasts for a test day before it has started:
First, a seasonally-adjusted time series $a_t$ is obtained from a vertical
time series by STL decomposition.
Then, the actual forecasting model, trained on $a_t$, makes an $H$-step-ahead
prediction.
Lastly, we add the $H$ seasonal na\"{i}ve forecasts for the seasonal component
$s_t$ to them to obtain the actual predictions for the test day.
Thus, only one training is required per model type, and no real-time data
    are used.
By decomposing the raw time series, all long-term patterns are assumed to be
in the seasonal component $s_t$, and $a_t$ only contains the level with
a potential trend and auto-correlations.
The models in this family are:
\begin{enumerate}
\item \textit{\gls{fnaive}},
\textit{\gls{pnaive}}:
Sum of STL's trend and seasonal components' na\"{i}ve forecasts
\item \textit{\gls{vholt}},
\textit{\gls{vses}}, and
\textit{\gls{vtheta}}:
Exponential smoothing without calibration and seasonal
fit
\item \textit{\gls{vets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{varima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
As mentioned in Sub-section \ref{unified_cv}, we include the sum of the
(seasonal) na\"{i}ve forecasts of the STL's trend and seasonal components
as forecasts on their own:
For \textit{fnaive}, we tune the "flexible" $ns$ parameter, and for
\textit{pnaive}, we set it to a "periodic" value.
Thus, we implicitly assume that there is no signal in the remainder $r_t$, and
predict $0$ for it.
\textit{fnaive} and \textit{pnaive} are two additional simple benchmarks.
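To illustrate the mechanics with one member of the family, the sketch below
    (in Python, using statsmodels; \textit{vses}-like, with simulated data)
    decomposes a vertical series, forecasts the adjusted series $H$ steps
    ahead, and adds the seasonal na\"{i}ve forecasts back:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def vertical_day_ahead(y, period=84, ns=11, H=12):
    res = STL(y, period=period, seasonal=ns).fit()
    adjusted = y - res.seasonal                 # a_t = trend + remainder
    model_fc = SimpleExpSmoothing(adjusted).fit().forecast(H)
    seasonal_fc = res.seasonal[-period:][:H]    # seasonal naive forecast of s_t
    return model_fc + seasonal_fc               # predictions for the test day

rng = np.random.default_rng(2)
y = rng.poisson(3, size=4 * 7 * 12).astype(float)
print(vertical_day_ahead(y)[:3])
\end{verbatim}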

View file

@ -0,0 +1,22 @@
\subsubsection{Vertical and Real-time Forecasts with Retraining.}
\label{rt}
The lower-left in Figure \ref{f:inputs} shows how models trained on vertical
time series are extended with real-time order data as it becomes available
during a test day:
Instead of obtaining an $H$-step-ahead forecast, we retrain a model after
every time step and only predict one step.
The remainder is as in the previous sub-section, and the models are:
\begin{enumerate}
\item \textit{\gls{rtholt}},
\textit{\gls{rtses}}, and
\textit{\gls{rttheta}}:
Exponential smoothing without calibration and seasonal fit
\item \textit{\gls{rtets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{rtarima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
Retraining \textit{fnaive} and \textit{pnaive} did not increase accuracy, and
thus we left them out.
A downside of this family is the significant increase in computing costs.
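A sketch of the retraining loop (in Python; \textit{rtses}-like, with the
    same assumptions as before) makes that burden apparent, as both the
    decomposition and the model are re-fit after every observed time step:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def real_time_day(history, actuals_today, period=84, ns=11):
    y = np.asarray(history, dtype=float)
    forecasts = []
    for actual in actuals_today:
        res = STL(y, period=period, seasonal=ns).fit()
        adjusted = y - res.seasonal
        step_fc = SimpleExpSmoothing(adjusted).fit().forecast(1)[0]
        forecasts.append(step_fc + res.seasonal[-period])  # add back s_t
        y = np.append(y, actual)                            # real-time update
    return forecasts

rng = np.random.default_rng(3)
history = rng.poisson(3, size=4 * 7 * 12).astype(float)
print(real_time_day(history, actuals_today=[2.0, 4.0, 5.0]))
\end{verbatim}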

View file

@ -0,0 +1,54 @@
\subsubsection{Vertical and Real-time Forecasts without Retraining.}
\label{ml_models}
The lower-right in Figure \ref{f:inputs} shows how ML models take
real-time order data into account without retraining.
Based on the seasonally-adjusted time series $a_t$, we employ the feature
matrix and label vector representations from Sub-section \ref{learning}
and set $n$ to the number of daily time steps, $H$, to cover all potential
auto-correlations.
The ML models are trained once before a test day starts.
For training, the matrix and vector are populated such that $y_T$ corresponds
    to the last time step of the day preceding the test day, $a_T$.
As the splitting during CV is done with whole days, the \gls{ml} models are
    trained with training sets containing samples from all times of a day in
    equal proportion.
Thus, the ML models learn to predict each time of the day.
For prediction on a test day, the $H$ observations preceding the time
step to be forecast are used as the input vector after seasonal
adjustment.
As a result, real-time data are included.
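A minimal sketch of this representation and of fitting one model of the
    family (in Python with scikit-learn; simulated data and illustrative
    parameters) follows; the models themselves are listed next:
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def lag_matrix(a, H=12):
    # Labels y_t and feature rows (a_{t-1}, ..., a_{t-H}) built from the
    # seasonally-adjusted series a_t, as described in the text above.
    y = np.array([a[t] for t in range(H, len(a))])
    X = np.array([a[t - H:t][::-1] for t in range(H, len(a))])
    return X, y

rng = np.random.default_rng(4)
a = rng.normal(0.0, 1.0, size=4 * 7 * 12)   # stand-in for the adjusted series
X, y = lag_matrix(a, H=12)
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# On the test day, the H observations preceding the next time step (after
# seasonal adjustment) form the input vector, so real-time data enter here.
x_next = a[-12:][::-1].reshape(1, -1)
print(model.predict(x_next))
\end{verbatim}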
The models in this family are:
\begin{enumerate}
\item \textit{\gls{vrfr}}: RF trained on the matrix as described
\item \textit{\gls{vsvr}}: SVR trained on the matrix as described
\end{enumerate}
We tried other ML models such as gradient boosting machines but found
only RFs and SVRs to perform well in our study.
In the case of gradient boosting machines, this is to be expected as they are
    known not to perform well in the presence of high noise (as is natural
    with low count data), as shown, for example, by \cite{ma2018} or
    \cite{mason2000}.
Also, deep learning methods are not applicable as the feature matrices only
consist of several hundred to thousands of rows (cf., Sub-section
\ref{params}).
In \ref{tabular_ml_models}, we provide an alternative feature matrix
representation that exploits the two-dimensional structure of time tables
without decomposing the time series.
In \ref{enhanced_feats}, we show how feature matrices are extended
to include predictors other than historical order data.
However, to answer \textbf{Q5} already here, none of the external data sources
improves the results in our study.
Due to the high number of time series in our study, investigating why no
    external source improves the forecasts requires an automated approach to
    analyzing individual time series.
\cite{barbour2014} provide a spectral density estimation approach, called
    the Shannon entropy, that measures the signal-to-noise ratio of a time
    series with a number normalized between $0$ and $1$, where lower values
    indicate a higher signal-to-noise ratio.
We then look at averages of the estimates on a daily level per pixel and
    find that including any of the external data sources from
    \ref{enhanced_feats} always leads to significantly lower signal-to-noise
    ratios.
Thus, we conclude that, at least for the demand faced by our industry
    partner, the historical order data already contain all of the signal.

View file

@ -1,2 +1,5 @@
\section{Empirical Study: A Meal Delivery Platform in Europe}
\label{stu}
% temporary placeholder
\label{params}

54
tex/apx/case_study.tex Normal file
View file

@ -0,0 +1,54 @@
\section{Raw Order Data in the Case Study}
\label{dataset}
The raw data for the empirical study in Section \ref{stu} were provided by a
    meal delivery platform operating in five cities in France in 2016.
The platform received a total of 686,385 orders distributed as follows:
\
\begin{center}
\begin{tabular}{llr}
\hline
\thead{City} & \thead{Launch Day} & \thead{Orders} \\
\hline
Bordeaux & July 18 & 64,012 \\
Lille & October 30 & 14,362 \\
Lyon & February 21 & 214,635 \\
Nantes & October 31 & 12,900 \\
Paris & March 7 & 380,476 \\
\end{tabular}
\end{center}
\
The part of the database relevant for forecasting can be thought of as one
table per city, where each row represents one order and consists of the
following groups of columns:
\begin{enumerate}
\item \textbf{Restaurant Data}
\begin{enumerate}
\item unique ID and name
\item pickup location as latitude-longitude pair
\end{enumerate}
\item \textbf{Customer Data}
\begin{enumerate}
\item unique ID, name, and phone number
\item delivery location as latitude-longitude pair (mostly physical
addresses but also public spots)
\end{enumerate}
\item \textbf{Timestamps}
\begin{enumerate}
\item placement via the smartphone app
\item fulfillment workflow (pickup, delivery, cancellation, re-deliveries)
\end{enumerate}
\item \textbf{Courier Data}
\begin{enumerate}
\item unique ID, name, and phone number
\item shift data (begin, breaks, end)
\item average speed
\end{enumerate}
\item \textbf{Order Details}
\begin{enumerate}
\item meals and drinks
\item prices and discounts granted
\end{enumerate}
\end{enumerate}

121
tex/apx/enhanced_feats.tex Normal file
View file

@ -0,0 +1,121 @@
\section{Enhancing Forecasting Models with External Data}
\label{enhanced_feats}
In this appendix, we show how the feature matrix in Sub-section
\ref{ml_models} can be extended with features other than historical order
data.
Then, we provide an overview of what external data we tried out as predictors
in our empirical study.
\subsection{Enhanced Feature Matrices}
Feature matrices can naturally be extended by appending new feature columns
$x_{t,f}$ or $x_f$ on the right where the former represent predictors
changing throughout a day and the latter being static either within a
pixel or across a city.
$f$ refers to an external predictor variable, such as one of the examples
listed below.
In the SVR case, the columns should be standardized before fitting as external
    predictors are most likely on a different scale than the historical
    order data.
Thus, for a matrix with seasonally-adjusted order data $a_t$ in it, an
enhanced matrix looks as follows:
$$
\vec{y}
=
\begin{pmatrix}
a_T \\
a_{T-1} \\
\dots \\
a_{H+1}
\end{pmatrix}
~~~~~
\mat{X}
=
\begin{bmatrix}
a_{T-1} & a_{T-2} & \dots & a_{T-H} & ~~~
& x_{T,A} & \dots & x_{B} & \dots \\
a_{T-2} & a_{T-3} & \dots & a_{T-(H+1)} & ~~~
& x_{T-1,A} & \dots & x_{B} & \dots \\
\dots & \dots & \dots & \dots & ~~~
& \dots & \dots & \dots & \dots \\
a_H & a_{H-1} & \dots & a_1 & ~~~
& x_{H+1,A} & \dots & x_{B} & \dots
\end{bmatrix}
$$
\
Similarly, we can also enhance the tabular matrices from
\ref{tabular_ml_models}.
The same comments as for their pure equivalents in Sub-section \ref{ml_models}
apply, in particular, that ML models trained with an enhanced matrix can
process real-time data without being retrained.
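A sketch of such an enhancement (in Python with scikit-learn; the external
    columns here, a time-varying temperature and a static ambient-population
    count, are made-up examples) could look as follows:
\begin{verbatim}
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

rng = np.random.default_rng(5)
n_samples, H = 200, 12
X_lags = rng.normal(size=(n_samples, H))             # a_{t-1}, ..., a_{t-H}
x_temp = rng.normal(20.0, 5.0, size=(n_samples, 1))  # time-varying x_{t,f}
x_pop = np.full((n_samples, 1), 12345.0)             # static x_f for the pixel
X = np.hstack([X_lags, x_temp, x_pop])               # enhanced feature matrix
y = rng.normal(size=n_samples)

# Standardize all columns before fitting the SVR, as external predictors
# are on a different scale than the (adjusted) order data.
svr = make_pipeline(StandardScaler(), SVR(kernel="rbf")).fit(X, y)
print(svr.predict(X[:1]))
\end{verbatim}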
\subsection{External Data in the Empirical Study}
\label{external_data}
In the empirical study, we tested four groups of external features that we
briefly describe here.
\vskip 0.1in
\textbf{Calendar Features}:
\begin{itemize}
\item Time of day (as synthesized integers: e.g., 1,050 for 10:30 am,
or 1,600 for 4 pm)
\item Day of week (as one-hot encoded booleans)
\item Work day or not (as booleans)
\end{itemize}
\vskip 0.1in
\textbf{Features Derived from the Historical Order Data}:
\begin{itemize}
\item Number of pre-orders for a time step (as integers)
\item 7-day SMA of the percentages of discounted orders (as percentages):
The platform is known for running marketing campaigns aimed at
first-time customers at irregular intervals. Consequently, the
order data show a wave-like pattern of coupons redeemed when looking
at the relative share of discounted orders per day.
\end{itemize}
\vskip 0.1in
\textbf{Neighborhood Features}:
\begin{itemize}
\item Ambient population (as integers) as obtained from the ORNL LandScan
database
\item Number of active platform restaurants (as integers)
\item Number of overall restaurants, food outlets, retailers, and other
businesses (as integers) as obtained from the Google Maps and Yelp
web services
\end{itemize}
\vskip 0.1in
\textbf{Real-time Weather} (raw data obtained from IBM's
Wunderground database):
\begin{itemize}
\item Absolute temperature, wind speed, and humidity
(as decimals and percentages)
\item Relative temperature with respect to 3-day and 7-day historical
means (as decimals)
\item Day vs. night defined by sunset (as booleans)
\item Summarized description (as indicators $-1$, $0$, and $+1$)
\item Lags of the absolute temperature and the summaries covering the
previous three hours
\end{itemize}
\vskip 0.1in
Unfortunately, we must report that none of the mentioned external data
    improved the accuracy of the forecasts.
Some led to models overfitting the data, which could not be remedied.
Manual tests revealed that real-time weather data are the most promising
external source.
Nevertheless, the data provided by IBM's Wunderground database originate from
    weather stations close to airports, which implies that all pixels of a
    city share the same aggregate weather data.
If weather data become available at a more granular level in the future, we
    see some potential for exploitation.

261
tex/apx/peak_results.tex Normal file
View file

@ -0,0 +1,261 @@
\section{Forecasting Accuracies during Peak Times}
\label{peak_results}
This appendix shows all result tables from the main text with the MASE
averages calculated from time steps within peak times.
Peaks are the times of the day when the typical customer has lunch or dinner,
    defined to be either from 12 pm to 2 pm or from 6 pm to 8 pm.
While the exact MASE values differ from those in the main text, the relative
    ranks of the forecasting methods are the same except in rare cases.
\begin{center}
\captionof{table}{Top-3 models by training weeks and average demand
($1~\text{km}^2$ pixel size, 60-minute time steps)}
\label{t:results:a}
\begin{tabular}{|c|c|*{12}{c|}}
\hline
\multirow{3}{*}{\rotatebox{90}{\thead{Training}}}
& \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}}
& \multicolumn{3}{c|}{\thead{No Demand}}
& \multicolumn{3}{c|}{\thead{Low Demand}}
& \multicolumn{3}{c|}{\thead{Medium Demand}}
& \multicolumn{3}{c|}{\thead{High Demand}} \\
~ & ~
& \multicolumn{3}{c|}{(0 - 2.5)}
& \multicolumn{3}{c|}{(2.5 - 10)}
& \multicolumn{3}{c|}{(10 - 25)}
& \multicolumn{3}{c|}{(25 - $\infty$)} \\
\cline{3-14}
~ & ~
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$ \\
\hline \hline
\multirow{3}{*}{3} & 1
& \textbf{\textit{trivial}}
& 0.794 & \multirow{3}{*}{\rotatebox{90}{4586}}
& \textbf{\textit{hsma}}
& 0.817 & \multirow{3}{*}{\rotatebox{90}{2975}}
& \textbf{\textit{hsma}}
& 0.838 & \multirow{3}{*}{\rotatebox{90}{2743}}
& \textbf{\textit{rtarima}}
& 0.871 & \multirow{3}{*}{\rotatebox{90}{2018}} \\
~ & 2
& \textit{hsma} & 0.808 & ~
& \textit{hses} & 0.847 & ~
& \textit{hses} & 0.851 & ~
& \textit{rtses} & 0.872 & ~ \\
~ & 3
& \textit{pnaive} & 0.938 & ~
& \textit{hets} & 0.848 & ~
& \textit{hets} & 0.853 & ~
& \textit{rtets} & 0.874 & ~ \\
\hline
\multirow{3}{*}{4} & 1
& \textbf{\textit{trivial}}
& 0.791 & \multirow{3}{*}{\rotatebox{90}{4532}}
& \textbf{\textit{hsma}}
& 0.833 & \multirow{3}{*}{\rotatebox{90}{3033}}
& \textbf{\textit{hsma}}
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2687}}
& \textbf{\textit{vrfr}}
& 0.848 & \multirow{3}{*}{\rotatebox{90}{2016}} \\
~ & 2
& \textit{hsma} & 0.794 & ~
& \textit{hses} & 0.838 & ~
& \textit{hses} & 0.847 & ~
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
~ & 3
& \textit{pnaive} & 0.907 & ~
& \textit{hets} & 0.841 & ~
& \textit{hets} & 0.851 & ~
& \textit{rtses} & 0.857 & ~ \\
\hline
\multirow{3}{*}{5} & 1
& \textbf{\textit{trivial}}
& 0.782 & \multirow{3}{*}{\rotatebox{90}{4527}}
& \textbf{\textit{hsma}}
& 0.844 & \multirow{3}{*}{\rotatebox{90}{3055}}
& \textbf{\textit{hsma}}
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2662}}
& \textbf{\textit{vrfr}}
& 0.849 & \multirow{3}{*}{\rotatebox{90}{2019}} \\
~ & 2
& \textit{hsma} & 0.802 & ~
& \textit{hses} & 0.851 & ~
& \textit{hets} & 0.844 & ~
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
~ & 3
& \textit{pnaive} & 0.888 & ~
& \textit{hets} & 0.863 & ~
& \textit{hses} & 0.845 & ~
& \textit{vsvr} & 0.853 & ~ \\
\hline
\multirow{3}{*}{6} & 1
& \textbf{\textit{trivial}}
& 0.743 & \multirow{3}{*}{\rotatebox{90}{4470}}
& \textbf{\textit{hsma}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{3086}}
& \textbf{\textit{hsma}}
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2625}}
& \textbf{\textit{vrfr}}
& 0.844 & \multirow{3}{*}{\rotatebox{90}{2025}} \\
~ & 2
& \textit{hsma} & 0.765 & ~
& \textit{hses} & 0.853 & ~
& \textit{hses} & 0.844 & ~
& \textbf{\textit{hets}} & 0.847 & ~ \\
~ & 3
& \textit{pnaive} & 0.836 & ~
& \textit{hets} & 0.861 & ~
& \textit{hets} & 0.844 & ~
& \textit{vsvr} & 0.849 & ~ \\
\hline
\multirow{3}{*}{7} & 1
& \textbf{\textit{trivial}}
& 0.728 & \multirow{3}{*}{\rotatebox{90}{4454}}
& \textbf{\textit{hsma}}
& 0.855 & \multirow{3}{*}{\rotatebox{90}{3132}}
& \textbf{\textit{hets}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2597}}
& \textbf{\textit{hets}}
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2007}} \\
~ & 2
& \textit{hsma} & 0.744 & ~
& \textit{hses} & 0.862 & ~
& \textit{hsma} & 0.845 & ~
& \textbf{\textit{vrfr}} & 0.842 & ~ \\
~ & 3
& \textit{pnaive} & 0.812 & ~
& \textit{hets} & 0.868 & ~
& \textbf{\textit{vsvr}} & 0.849 & ~
& \textit{vsvr} & 0.846 & ~ \\
\hline
\multirow{3}{*}{8} & 1
& \textbf{\textit{trivial}}
& 0.736 & \multirow{3}{*}{\rotatebox{90}{4402}}
& \textbf{\textit{hsma}}
& 0.865 & \multirow{3}{*}{\rotatebox{90}{3159}}
& \textbf{\textit{hets}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2575}}
& \textbf{\textit{hets}}
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
~ & 2
& \textit{hsma} & 0.759 & ~
& \textit{hets} & 0.874 & ~
& \textbf{\textit{vsvr}} & 0.848 & ~
& \textbf{\textit{vrfr}} & 0.841 & ~ \\
~ & 3
& \textit{pnaive} & 0.820 & ~
& \textit{hses} & 0.879 & ~
& \textit{hsma} & 0.850 & ~
& \textit{vsvr} & 0.847 & ~ \\
\hline
\end{tabular}
\end{center}
\begin{center}
\captionof{table}{Ranking of benchmark and horizontal models
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:hori:a}
\begin{tabular}{|c|ccc|cccccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{3}{c|}{\thead{Benchmarks}}
& \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
\cline{2-12}
~ & \textit{naive} & \textit{fnaive} & \textit{pnaive}
& \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
& \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
\hline \hline
3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
& 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
& 9 (10) & 10 (9) & 2 (8) & 1 (4) & 6 (7) \\
5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
& 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
& 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
\hline
\end{tabular}
\end{center}
\
\begin{center}
\captionof{table}{Ranking of classical models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:vert:a}
\begin{tabular}{|c|cc|ccccc|ccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{2}{c|}{\thead{Benchmarks}}
& \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
& \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
\cline{2-13}
~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
& \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
& \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
\hline \hline
3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
4 & 2 (7) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (8) & 4 (2) & 11 (12) \\
5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
& 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
& 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
& 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
8 & 2 (1) & 1 (9) & 8 (5) & 7 & 10 (8)
& 6 & 12 (10) & 5 (2) & 4 & 9 (6) & 3 & 11 \\
\hline
\end{tabular}
\end{center}
\
\pagebreak
\begin{center}
\captionof{table}{Ranking of ML models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:ml:a}
\begin{tabular}{|c|cccc|cc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{4}{c|}{\thead{Benchmarks}}
& \multicolumn{2}{c|}{\thead{ML}} \\
\cline{2-7}
~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
& \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
\hline \hline
3 & 6 & 2 (5) & 1 (3) & 3 (1) & 5 (2) & 4 \\
4 & 6 (5) & 2 (3) & 1 (6) & 3 (2) & 5 (1) & 4 \\
5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
\hline
\end{tabular}
\end{center}
\

View file

@ -0,0 +1,58 @@
\section{Tabular and Real-time Forecasts without Retraining}
\label{tabular_ml_models}
Regarding the structure of the feature matrix for the ML models in Sub-section
\ref{ml_models}, we provide an alternative approach that works without
the STL method.
Instead of decomposing a time series and arranging the resulting
seasonally-adjusted time series $a_t$ into a matrix $\mat{X}$, one can
create a matrix with two types of feature columns mapped to the raw
observations in $\vec{y}$:
While the first group of columns takes all observations of the same time of
day over a horizon of, for example, one week ($n_h=7$), the second group
takes all observations covering a pre-defined time horizon, for example
$3$ hours ($n_r=3$ for 60-minute time steps), preceding the time step to
be fitted.
Thus, we exploit the two-dimensional structure of time tables as well, and
conceptually model historical and recent demand.
The alternative feature matrix appears as follows where the first three
columns are the historical and the last three the recent demand features:
$$
\vec{y}
=
\begin{pmatrix}
y_T \\
y_{T-1} \\
\dots \\
y_{1+n_hH}
\end{pmatrix}
~~~~~
\mat{X}
=
\begin{bmatrix}
y_{T-H} & y_{T-2H} & \dots & y_{T-n_hH}
& y_{T-1} & y_{T-2} & \dots & y_{T-n_r} \\
y_{T-1-H} & y_{T-1-2H} & \dots & y_{T-1-n_hH}
& y_{T-2} & y_{T-3} & \dots & y_{T-n_r-1} \\
\dots & \dots & \dots & \dots
& \dots & \dots & \dots & \dots \\
y_{1+(n_h-1)H} & y_{1+(n_h-2)H} & \dots & y_1
& y^*_{1+n_hH-1} & y^*_{1+n_hH-2} & \dots & y^*_{1+n_hH-n_r}
\end{bmatrix}
$$
\
As a detail, we note that recent demand features that would lie at the end of
    the previous day are set to $0$, which is shown with the $^*$ notation
    above.
This alignment of the undecomposed order data $y_t$ ensures that the ML
models learn the two seasonal patterns independently.
The parameters $n_h$ and $n_r$ must be adapted to the data, but we found the
above values to work well.
As such matrices resemble time tables, we refer to them as tabular.
However, we found the ML models trained on vertical time series to outperform
    the tabular ML models, which is why we disregarded the latter in the study.
This tabular form could still be beneficial for UDPs whose demand exhibits a
    weaker seasonality than that of a meal delivery platform.
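For completeness, a sketch of how such a tabular matrix can be populated (in
    Python; the series is simulated and assumed to start at the first time
    step of a day) is:
\begin{verbatim}
import numpy as np

def tabular_matrix(y, H=12, n_h=7, n_r=3):
    # n_h same-time-of-day lags (historical demand) and n_r immediately
    # preceding time steps (recent demand); recent lags that would fall
    # onto the previous day are set to 0 (the * notation above).
    rows, labels = [], []
    for t in range(n_h * H, len(y)):
        hist = [y[t - j * H] for j in range(1, n_h + 1)]
        recent = [y[t - j] if (t % H) - j >= 0 else 0.0
                  for j in range(1, n_r + 1)]
        rows.append(hist + recent)
        labels.append(y[t])
    return np.array(rows), np.array(labels)

rng = np.random.default_rng(6)
y = rng.poisson(3, size=6 * 7 * 12).astype(float)
X, labels = tabular_matrix(y)
print(X.shape, labels.shape)   # (n_samples, n_h + n_r), (n_samples,)
\end{verbatim}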

View file

@ -5,6 +5,9 @@
\newglossaryentry{cv}{
name=CV, description={Cross Validation}
}
\newglossaryentry{mase}{
name=MASE, description={Mean Absolute Scaled Error}
}
\newglossaryentry{ml}{
name=ML, description={Machine Learning}
}
@ -27,4 +30,103 @@
name=VRP, description={Vehicle Routing Problem}
}
% Model names.
\newglossaryentry{naive}{
name=naive, description={(Seasonal) Na\"{i}ve Method}
}
\newglossaryentry{fnaive}{
name=fnaive, description={"Flexible" STL Decomposition,
with tuned ns parameter}
}
\newglossaryentry{pnaive}{
name=pnaive, description={"Periodic" STL Decomposition,
with ns parameter set to large number}
}
\newglossaryentry{trivial}{
name=trivial, description={Trivial Method}
}
\newglossaryentry{hcroston}{
name=hcroston, description={Croston's Method,
trained on horizontal time series}
}
\newglossaryentry{hholt}{
name=hholt, description={Holt's Linear Trend Method,
trained on horizontal time series}
}
\newglossaryentry{vholt}{
name=vholt, description={Holt's Linear Trend Method,
trained on vertical time series}
}
\newglossaryentry{rtholt}{
name=rtholt, description={Holt's Linear Trend Method,
(re)trained on vertical time series}
}
\newglossaryentry{hhwinters}{
name=hhwinters, description={Holt-Winters' Seasonal Method,
trained on horizontal time series}
}
\newglossaryentry{hses}{
name=hses, description={Simple Exponential Smoothing Method,
trained on horizontal time series}
}
\newglossaryentry{vses}{
name=vses, description={Simple Exponential Smoothing Method,
trained on vertical time series}
}
\newglossaryentry{rtses}{
name=rtses, description={Simple Exponential Smoothing Method,
(re)trained on vertical time series}
}
\newglossaryentry{hsma}{
name=hsma, description={Simple Moving Average Method,
trained on horizontal time series}
}
\newglossaryentry{htheta}{
name=htheta, description={Theta Method,
trained on horizontal time series}
}
\newglossaryentry{vtheta}{
name=vtheta, description={Theta Method,
trained on vertical time series}
}
\newglossaryentry{rttheta}{
name=rttheta, description={Theta Method,
(re)trained on vertical time series}
}
\newglossaryentry{hets}{
name=hets, description={ETS State Space Method,
trained on horizontal time series}
}
\newglossaryentry{vets}{
name=vets, description={ETS State Space Method,
trained on vertical time series}
}
\newglossaryentry{rtets}{
name=rtets, description={ETS State Space Method,
(re)trained on vertical time series}
}
\newglossaryentry{harima}{
name=harima, description={Autoregressive Integrated Moving Average
Method,
trained on horizontal time series}
}
\newglossaryentry{varima}{
name=varima, description={Autoregressive Integrated Moving Average
Method,
trained on vertical time series}
}
\newglossaryentry{rtarima}{
name=rtarima, description={Autoregressive Integrated Moving Average
Method,
(re)trained on vertical time series}
}
\newglossaryentry{vrfr}{
name=vrfr, description={Random Forest Regression Method,
trained on vertical time series}
}
\newglossaryentry{vsvr}{
name=vsvr, description={Support Vector Regression Method,
trained on vertical time series}
}
\printglossaries

View file

@ -4,6 +4,12 @@
\usepackage[acronym]{glossaries}
\makeglossaries
% Enable captions for figures and tables.
\usepackage{caption}
% Enable diagonal lines in tables.
\usepackage{static/slashbox}
% Make opening quotes look different than closing quotes.
\usepackage[english=american]{csquotes}
\MakeOuterQuote{"}

View file

@ -26,6 +26,16 @@ volume={1},
pages={461--466}
}
@article{barbour2014,
title={psd: Adaptive, sine multitaper power spectral density estimation for R},
author={Barbour, Andrew J and Parker, Robert L},
year={2014},
journal={Computers \& Geosciences},
volume={63},
pages={1--8},
publisher={Elsevier}
}
@misc{bell2018,
title = {Forecasting at Uber: An Introduction},
author={Bell, Franziska and Smyl, Slawek},
@ -139,6 +149,16 @@ number={1},
pages={3--73}
}
@article{croston1972,
title={Forecasting and Stock Control for Intermittent Demands},
author={Croston, J D},
year={1972},
journal={Journal of the Operational Research Society},
volume={23},
number={3},
pages={289--303}
}
@book{dagum2016,
title={Seasonal Adjustment Methods and Real Time Trend-Cycle Estimation},
author={Dagum, Estela and Bianconcini, Silvia},
@ -275,6 +295,17 @@ number={2},
pages={287--290}
}
@article{hyndman2006,
title={Another Look at Measures of Forecast Accuracy},
author={Hyndman, Rob and Koehler, Anne},
year={2006},
journal={International Journal of Forecasting},
volume={22},
number={4},
pages={679--688},
publisher={Elsevier}
}
@article{hyndman2008a,
title={Automatic Time Series Forecasting: The forecast package for R},
author={Hyndman, Rob and Khandakar, Yeasmin},
@ -298,6 +329,18 @@ year={2018},
publisher={OTexts}
}
@article{kim2016,
title={A new Metric of Absolute Percentage Error for Intermittent Demand
Forecasts},
author={Kim, Sungil and Kim, Heeyoung},
year={2016},
journal={International Journal of Forecasting},
volume={32},
number={3},
pages={669--679},
publisher={Elsevier}
}
@article{kwiatkowski1992,
title={Testing the null hypothesis of stationarity against the alternative of a
unit root: How sure are we that economic time series have a unit root?},
@ -319,6 +362,17 @@ howpublished = {\url{https://eng.uber.com/neural-networks/}},
note = {Accessed: 2020-10-01}
}
@article{ma2018,
title={Using the Gradient Boosting Decision Tree to Improve the Delineation of
Hourly Rain Areas during the Summer from Advanced Himawari Imager Data},
author={Ma, Liang and Zhang, Guoping and Lu, Er},
year={2018},
journal={Journal of Hydrometeorology},
volume={19},
number={5},
pages={761--776},
}
@article{masmoudi2018,
title={The dial-a-ride problem with electric vehicles and battery
swapping stations},
@ -330,6 +384,15 @@ volume={118},
pages={392--420}
}
@inproceedings{mason2000,
title={Boosting algorithms as gradient descent},
author={Mason, Llew and Baxter, Jonathan and Bartlett, Peter L
and Frean, Marcus R},
year={2000},
booktitle={Advances in Neural Information Processing Systems},
pages={512--518}
}
@inproceedings{mueller1997,
title={Predicting Time Series with Support Vector Machines},
author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar
@ -367,6 +430,18 @@ number={5},
pages={311--315}
}
@article{prestwich2014,
title={Mean-based Error Measures for Intermittent Demand Forecasting},
author={Prestwich, Steven and Rossi, Roberto and Tarim, Armagan
and Hnich, Brahim},
year={2014},
journal={International Journal of Production Research},
volume={52},
number={22},
pages={6782--6791},
publisher={Taylor \& Francis}
}
@incollection{scholkopf1998,
title={Fast Approximation of Support Vector Kernel Expansions, and an
Interpretation of Clustering as Approximation in Feature Spaces},
@ -378,6 +453,13 @@ publisher={Springer},
pages={125--132}
}
@book{singleton2017,
title={Urban Analytics},
author={Singleton, Alex David and Spielman, Seth and Folch, David},
year={2017},
publisher={Sage}
}
@article{smola2004,
title={A Tutorial on Support Vector Regression},
author={Smola, Alex and Sch{\"o}lkopf, Bernhard},
@ -398,6 +480,17 @@ pages={285--292},
publisher={MIT, Cambridge, MA, USA}
}
@article{syntetos2005,
title={The Accuracy of Intermittent Demand Estimates},
author={Syntetos, Aris and Boylan, John},
year={2005},
journal={International Journal of Forecasting},
volume={21},
number={2},
pages={303--314},
publisher={Elsevier}
}
@article{taylor2003,
title={Exponential Smoothing with a Damped Multiplicative Trend},
author={Taylor, James},
@ -442,6 +535,18 @@ volume={118},
pages={496--512}
}
@article{winkenbach2015,
title={Enabling urban logistics services at La Poste through
multi-echelon location-routing},
author={Winkenbach, Matthias and Kleindorfer, Paul R and Spinler, Stefan},
year={2015},
journal={Transportation Science},
volume={50},
number={2},
pages={520--540},
publisher={INFORMS}
}
@article{winters1960,
title={Forecasting Sales by Exponentially Weighted Moving Averages},
author={Winters, Peter},