
Merge branch 'model-section' into develop

This commit is contained in:
Alexander Hess 2020-10-04 23:42:50 +02:00
commit 20abf8eade
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
26 changed files with 1354 additions and 6 deletions

BIN
paper.pdf

Binary file not shown.

View file

@ -19,6 +19,16 @@
\input{tex/2_lit/3_ml/4_rf}
\input{tex/2_lit/3_ml/5_svm}
\input{tex/3_mod/1_intro}
\input{tex/3_mod/2_overall}
\input{tex/3_mod/3_grid}
\input{tex/3_mod/4_cv}
\input{tex/3_mod/5_mase}
\input{tex/3_mod/6_decomp}
\input{tex/3_mod/7_models/1_intro}
\input{tex/3_mod/7_models/2_hori}
\input{tex/3_mod/7_models/3_vert}
\input{tex/3_mod/7_models/4_rt}
\input{tex/3_mod/7_models/5_ml}
\input{tex/4_stu/1_intro}
\input{tex/5_con/1_intro}
@ -29,6 +39,10 @@
\appendix
\newpage
\input{tex/apx/tabular_ml_models}
\newpage
\input{tex/apx/enhanced_feats}
\newpage
\bibliographystyle{static/elsarticle-harv}
\bibliography{tex/references}

Binary file not shown. (new image, 4.5 KiB)

Binary file not shown. (new image, 261 KiB)

Binary file not shown. (new image, 84 KiB)

77
static/slashbox.sty Normal file
View file

@ -0,0 +1,77 @@
% slashbox.sty by Koichi Yasuoka, May 27, 1993
% minor modification by Toru Sato, May 31, 1993
\typeout{slashbox style by K.Yasuoka, May 1993.}%
\newbox\@slashboxa
\newbox\@slashboxb
\newbox\@slashboxc
\newcount\@slashboxwd
\newcount\@slashboxht
\newdimen\@slashsepl
\newdimen\@slashsepr
\def\slashbox{%
\def\@slashboxpicture##1{%
\put(0,0){\line(##1,1){\@slashboxwd}}%
\put(0,\@slashboxht){\makebox(0,0)[tl]{\box\@slashboxa}}%
\put(\@slashboxwd,0){\makebox(0,0)[br]{\box\@slashboxb}}%
}%
\@slashbox
}%
\def\backslashbox{%
\def\@slashboxpicture##1{%
\put(0,\@slashboxht){\line(##1,-1){\@slashboxwd}}%
\put(0,0){\makebox(0,0)[bl]{\box\@slashboxa}}%
\put(\@slashboxwd,\@slashboxht){\makebox(0,0)[tr]{\box\@slashboxb}}%
}%
\@slashbox
}%
\def\@slashbox{\@ifnextchar [{\@@slashbox}{\@@slashbox[0pt]}}
\def\@@slashbox[#1]{\@ifnextchar [{\@@@slashbox[#1]}{\@@@slashbox[#1][c]}}
\def\@@@slashbox[#1][#2]#3#4{%
% #1: width, #2: suppression of \tabcolsep on `l', `r', or `lr' side
% #3: left item, #4: right item
\@slashsepl=\tabcolsep
\@slashsepr=\tabcolsep
\@tfor\@tempa :=#2\do{\expandafter\let
\csname @slashsep\@tempa\endcsname=\z@}%
\setbox\@slashboxa=\hbox{\strut\hskip\tabcolsep\shortstack[l]{#3}}%
\setbox\@slashboxb=\hbox{\shortstack[r]{#4}\hskip\tabcolsep\strut}%
\setbox\@slashboxa=\hbox{\raise\dp\@slashboxa\box\@slashboxa}%
\setbox\@slashboxb=\hbox{\raise\dp\@slashboxb\box\@slashboxb}%
\setbox\@slashboxc=\hbox{%
\@tempdima=\wd\@slashboxa
\advance\@tempdima by \wd\@slashboxb
\advance\@tempdima by \@slashsepl
\advance\@tempdima by \@slashsepr
\@tempdimb=#1\relax%
\ifdim\@tempdimb>\@tempdima \@tempdima=\@tempdimb\fi%
\@tempdimb=\ht\@slashboxa
\advance\@tempdimb by \dp\@slashboxa
\advance\@tempdimb by \ht\@slashboxb
\advance\@tempdimb by \dp\@slashboxb
\@tempcnta=\@tempdima
\@tempcntb=\@tempdimb
\advance\@tempcnta by \@tempcntb
\advance\@tempcnta by -1
\divide\@tempcnta by \@tempcntb
\ifnum\@tempcnta>6 \@tempcnta=6
\@tempdimb=0.166666666\@tempdima
\else
\ifnum\@tempcnta<1 \@tempcnta=1\fi
\@tempdima=\@tempdimb
\multiply\@tempdima by \@tempcnta
\fi%
\advance\@tempdima by -\@slashsepl
\advance\@tempdima by -\@slashsepr
\@slashboxwd=\@tempdima
\@slashboxht=\@tempdimb
\@tempcntb=\@slashsepl
\setlength{\unitlength}{1sp}%
\begin{picture}(\@slashboxwd,\@slashboxht)(\@tempcntb,0)
\advance\@tempdima by \@slashsepl
\advance\@tempdima by \@slashsepr
\@slashboxwd=\@tempdima
\@slashboxpicture{\@tempcnta}
\end{picture}%
}%
$\vcenter{\box\@slashboxc}$%
}%

BIN
static/stl_gray.png Normal file

Binary file not shown. (new image, 104 KiB)

View file

@ -1,8 +1,6 @@
\section{Model Formulation}
\label{mod}
% temporary placeholders
\label{decomp}
\label{f:stl}
\label{mase}
\label{unified_cv}
In this section, we describe how the platform's raw data are pre-processed
into model inputs and how the forecasting models are built and benchmarked
against each other.

28
tex/3_mod/2_overall.tex Normal file
View file

@ -0,0 +1,28 @@
\subsection{Overall Approach}
\label{approach_approach}
On a conceptual level, there are three distinct aspects of the model
development process.
First, a pre-processing step transforms the platform's tabular order data into
either time series in Sub-section \ref{grid} or feature matrices in
Sub-section \ref{ml_models}.
Second, a benchmark methodology is developed in Sub-section \ref{unified_cv}
    that compares all models, in particular classical models with ML ones,
    on the same scale.
Concretely, the CV approach is adapted to the peculiar requirements of
    sub-daily and ordinal time series data so that each model's predictive
    power into the future is maximized.
Third, the forecasting models are described with respect to their assumptions
and training requirements.
Four classification dimensions are introduced:
\begin{enumerate}
\item \textbf{Timeliness of the Information}:
whole-day-ahead vs. real-time forecasts
\item \textbf{Time Series Decomposition}: raw vs. decomposed
\item \textbf{Algorithm Type}: "classical" statistics vs. ML
\item \textbf{Data Sources}: pure vs. enhanced (i.e., with external data)
\end{enumerate}
Not all of the sixteen possible combinations are implemented; instead, the
    models are varied along these dimensions to isolate different effects and
    answer the research questions.

95
tex/3_mod/3_grid.tex Normal file
View file

@ -0,0 +1,95 @@
\subsection{Gridification, Time Tables, and Time Series Generation}
\label{grid}
The platform's tabular order data are sliced with respect to both location and
    time and then aggregated into time series where each observation is the
    number of orders in an area during one time step (i.e., interval).
Figure \ref{f:grid} shows how the orders' delivery locations are each
    matched to a square-shaped cell, referred to as a pixel, on a grid
    covering the entire service area within a city.
This gridification step is also applied separately to the pickup locations.
The grid's lower-left corner is chosen at random.
\cite{winkenbach2015} apply the same gridification idea and slice an urban
area to model a location-routing problem, and \cite{singleton2017} portray
it as a standard method in the field of urban analytics.
With increasing pixel sizes, the time series aggregate more orders and thus
    likely exhibit a stronger demand pattern.
On the other hand, the larger the pixels, the less valuable the generated
    forecasts become as, for example, a courier sent into a pixel
    preemptively then faces a longer average distance to any restaurant
    within it.
\begin{center}
\captionof{figure}{Gridification for delivery locations in Paris with a pixel
size of $1~\text{km}^2$}
\label{f:grid}
\includegraphics[width=.8\linewidth]{static/gridification_for_paris_gray.png}
\end{center}
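To make the gridification step concrete, the following minimal sketch (in
    Python; not the platform's actual implementation) assigns delivery
    locations to pixel indices, assuming the coordinates are already
    projected to meters and shifting the grid origin at random:
\begin{verbatim}
import numpy as np

def gridify(x, y, pixel_side_m=1000.0, rng=None):
    # Map projected coordinates (in meters) to (column, row) pixel indices.
    # The grid origin (lower-left corner) is shifted by a random offset,
    # mirroring the random choice of the origin described above.
    rng = np.random.default_rng() if rng is None else rng
    x0 = x.min() - rng.uniform(0, pixel_side_m)
    y0 = y.min() - rng.uniform(0, pixel_side_m)
    cols = np.floor((x - x0) / pixel_side_m).astype(int)
    rows = np.floor((y - y0) / pixel_side_m).astype(int)
    return cols, rows

# Example: three delivery locations, 1 km^2 pixels.
x = np.array([1200.0, 1800.0, 5400.0])
y = np.array([300.0, 950.0, 4100.0])
print(gridify(x, y, pixel_side_m=1000.0))
\end{verbatim}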
After gridification, the ad-hoc orders within a pixel are aggregated by their
placement timestamps into sub-daily time steps of pre-defined lengths
to obtain a time table as exemplified in Figure \ref{f:timetable} with
one-hour intervals.
\begin{center}
\captionof{figure}{Aggregation into a time table with hourly time steps}
\label{f:timetable}
\begin{tabular}{|c||*{9}{c|}}
\hline
\backslashbox{Time}{Day} & \makebox[2em]{\ldots}
& \makebox[3em]{Mon} & \makebox[3em]{Tue}
& \makebox[3em]{Wed} & \makebox[3em]{Thu}
& \makebox[3em]{Fri} & \makebox[3em]{Sat}
& \makebox[3em]{Sun} & \makebox[2em]{\ldots} \\
\hline
\hline
11:00 & \ldots & $y_{11,Mon}$ & $y_{11,Tue}$ & $y_{11,Wed}$ & $y_{11,Thu}$
& $y_{11,Fri}$ & $y_{11,Sat}$ & $y_{11,Sun}$ & \ldots \\
\hline
12:00 & \ldots & $y_{12,Mon}$ & $y_{12,Tue}$ & $y_{12,Wed}$ & $y_{12,Thu}$
& $y_{12,Fri}$ & $y_{12,Sat}$ & $y_{12,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
& \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
20:00 & \ldots & $y_{20,Mon}$ & $y_{20,Tue}$ & $y_{20,Wed}$ & $y_{20,Thu}$
& $y_{20,Fri}$ & $y_{20,Sat}$ & $y_{20,Sun}$ & \ldots \\
\hline
21:00 & \ldots & $y_{21,Mon}$ & $y_{21,Tue}$ & $y_{21,Wed}$ & $y_{21,Thu}$
& $y_{21,Fri}$ & $y_{21,Sat}$ & $y_{21,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
& \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
\end{tabular}
\end{center}
\
Consequently, each $y_{t,d}$ in Figure \ref{f:timetable} is the number of
    all orders within the pixel for the time of day $t$ and day of week
    $d$ ($y_t$ and $y_{t,d}$ denote the same count; the latter merely makes
    the two-dimensional view explicit).
The same trade-off as with gridification applies:
The shorter the interval, the weaker the demand pattern to be expected in
    the time series due to less aggregation, while longer intervals lead to
    less usable forecasts.
We refer to time steps by their start time, and their number per day, $H$,
is constant.
Given a time table as in Figure \ref{f:timetable}, there are two ways to
    generate a time series by slicing:
\begin{enumerate}
\item \textbf{Horizontal View}:
Take only the order counts for a given time of the day
\item \textbf{Vertical View}:
Take all order counts and remove the double-seasonal pattern induced
by the weekday and time of the day with decomposition
\end{enumerate}
Distinct time series are retrieved by iterating through the time tables either
horizontally or vertically in increments of a single time step.
Another property of a generated time series is its length, which, following
    the next sub-section, can be interpreted as the length of the production
    training set plus the test day.
In summary, a distinct time series is generated from the tabular order data
based on a configuration of parameters for the dimensions pixel size,
number of daily time steps $H$, shape (horizontal vs. vertical), length,
and the time step to be predicted.
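As an illustration of the slicing, the following sketch (in Python with
    pandas; the data and names are made up) aggregates the raw order
    timestamps of a single pixel into an hourly time table and extracts a
    horizontal and a vertical series from it:
\begin{verbatim}
import pandas as pd

# One row per ad-hoc order within a single pixel (illustrative timestamps).
orders = pd.DataFrame({"placed_at": pd.to_datetime([
    "2016-07-18 11:05", "2016-07-18 11:40", "2016-07-18 12:10",
    "2016-07-19 11:20", "2016-07-19 12:30", "2016-07-19 12:45",
])})

# Aggregate into hourly time steps and pivot into a time table
# (rows: time of day, columns: calendar day), as in the figure above.
counts = (orders.set_index("placed_at")
                .resample("1H").size()
                .rename("n_orders").reset_index())
counts["day"] = counts["placed_at"].dt.date
counts["time_of_day"] = counts["placed_at"].dt.hour
timetable = counts.pivot_table(index="time_of_day", columns="day",
                               values="n_orders", fill_value=0)

horizontal_11h = timetable.loc[11]  # horizontal view: one time of day, all days
vertical = timetable.T.stack()      # vertical view: day by day, step by step
print(horizontal_11h, vertical, sep="\n")
\end{verbatim}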

86
tex/3_mod/4_cv.tex Normal file
View file

@ -0,0 +1,86 @@
\subsection{Unified Cross-Validation and Training, Validation, and Test Sets}
\label{unified_cv}
The standard $k$-fold CV, which assumes that the samples (i.e., the rows of
    $\mat{X}$ above) carry no temporal structure, is adapted to the ordinal
    character of time series data:
A model must be evaluated on observations that occurred strictly after the
    ones used for training as, otherwise, the model knows about the future.
Furthermore, some models predict only one or a few time steps before being
    retrained, while others predict an entire day without retraining
    (cf., Sub-section \ref{ml_models}).
Consequently, we must use a unified time interval wherein all forecasts are
made first before the entire interval is evaluated.
As whole days are the longest prediction interval for models without
retraining, we choose that as the unified time interval.
In summary, our CV methodology yields a distinct best model per pixel and day
to be forecast.
Whole days are also practical for managers who commonly monitor, for example,
the routing and thus the forecasting performance on a day-to-day basis.
Our methodology assumes that the models are trained at least once per day.
As we create operational forecasts into the near future in this paper,
retraining all models with the latest available data is a logical step.
\begin{center}
\captionof{figure}{Training, validation, and test sets
during cross validation}
\label{f:cv}
\includegraphics[width=.8\linewidth]{static/cross_validation_gray.png}
\end{center}
The training, validation, and test sets are defined as follows.
To exemplify the logic, we refer to Figure \ref{f:cv} that shows the calendar
setup (i.e., weekdays on the x-axis) for three days $T_1$, $T_2$, and
$T_3$ (shown in dark gray) for which we generate forecasts.
Each of these days is, by definition, a test day, and the test set comprises
all time series, horizontal or vertical, whose last observation lies on
that day.
With an assumed training horizon of three weeks, the 21 days before each of
the test days constitute the corresponding training sets (shown in lighter
gray on the same rows as $T_1$, $T_2$, and $T_3$).
There are two kinds of validation sets, depending on the decision to be made.
First, if a forecasting method needs parameter tuning, the original training
    set is divided into as many equally long series as there are validation
    days needed to find stable parameters.
The example shows three validation days per test day named $V_n$ (shown
in darker gray below each test day).
The $21 - 3 = 18$ preceding days constitute the training set corresponding to
a validation day.
To obtain the overall validation error, the three errors are averaged.
We call these \textit{inner} validation sets because they must be repeated
each day to re-tune the parameters and because the involved time series
are true subsets of the original series.
Second, to find the best method per day and pixel, the same averaging logic
is applied on the outer level.
For example, if we used two validation days to find the best method for $T_3$,
we would average the errors of $T_1$ and $T_2$ for each method and select
the winner; then, $T_1$ and $T_2$ constitute an \textit{outer} validation
set.
Whereas the number of inner validation days is method-specific and must be
chosen before generating any test day forecasts in the first place, the
number of outer validation days may be varied after the fact and is
determined empirically as we show in Section \ref{stu}.
Our unified CV approach is also optimized for large-scale production settings,
for example, at companies like Uber.
As \cite{bell2018} note, there is a trade-off as to when each of the
inner time series in the example begins.
While the forecasting accuracy likely increases with more training days,
    which favors inner series of increasing length, cutting the series to the
    same length allows caching the forecasts and errors.
In the example, $V_3$, $V_5$, and $V_7$, as well as $V_6$ and $V_8$ are
identical despite belonging to different inner validation sets.
Caching is also possible on the outer level when searching for an optimal
number of validation days for model selection.
We achieved up to 80\% cache hit ratios in our implementation in the
empirical study, thereby saving computational resources by the same
amount.
Lastly, we assert that our suggested CV, because it is unified around whole
    test days and uses fixed-size time series, is also suitable for creating
    consistent learning curves and, thus, answering
    \textbf{Q3} on the relationship between forecast accuracy and the amount
    of historical data:
We simply increase the length of the outer training set while holding the
    test day fixed.
Thus, independent of a method's need for parameter tuning, all methods have
the same demand history available for each test day forecast.
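A minimal sketch of the resulting splits (in Python; the day counts follow
    the example above and are otherwise assumptions) could look as follows:
\begin{verbatim}
import datetime as dt

def outer_split(test_day, n_train_days=21):
    # All n_train_days calendar days strictly before the test day
    # form the outer training set.
    train = [test_day - dt.timedelta(days=d) for d in range(n_train_days, 0, -1)]
    return train, test_day

def inner_splits(training_days, n_val_days=3, n_inner_train_days=18):
    # Each of the last n_val_days training days becomes an inner validation
    # day, trained on the n_inner_train_days days immediately preceding it.
    # The fixed length is what makes forecasts and errors cacheable.
    for i in range(1, n_val_days + 1):
        pos = len(training_days) - i
        yield training_days[pos - n_inner_train_days:pos], training_days[pos]

train_days, test_day = outer_split(dt.date(2016, 7, 18))
for inner_train, val_day in inner_splits(train_days):
    print(val_day, inner_train[0], "...", inner_train[-1])
\end{verbatim}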

87
tex/3_mod/5_mase.tex Normal file
View file

@ -0,0 +1,87 @@
\subsection{Accuracy Measures}
\label{mase}
Choosing an error measure for both model selection and evaluation is not
straightforward when working with intermittent demand, as shown, for
example, by \cite{syntetos2005}, and one should understand the trade-offs
between measures.
\cite{hyndman2006} provide a study of measures with real-life data taken from
the popular M3-competition and find that most standard measures degenerate
under many scenarios.
They also provide a classification scheme for which we summarize the main
points as they apply to the UDP case:
\begin{enumerate}
\item \textbf{Scale-dependent Errors}:
The error is reported in the same unit as the raw data.
Two popular examples are the root mean square error (RMSE) and mean absolute
error (MAE).
They may be used for model selection and evaluation within a pixel, and are
  intuitively interpretable; however, they may not be used to compare errors
  of, for example, a low-demand pixel (e.g., at the UDP's service
  boundary) with those of a high-demand pixel (e.g., downtown).
\item \textbf{Percentage Errors}:
The error is derived from the percentage errors of individual forecasts per
time step, and is also intuitively interpretable.
A popular example is the mean absolute percentage error (MAPE), which is the
  primary measure in most forecasting competitions.
Whereas such errors could be applied both within and across pixels, they
cannot be calculated reliably for intermittent demand.
If only one time step exhibits no demand, the result is a divide-by-zero
error.
This often occurs even in high-demand pixels due to the slicing.
\item \textbf{Relative Errors}:
A workaround is to calculate a scale-dependent error for the test day and
divide it by the same measure calculated with forecasts of a simple
benchmark method (e.g., na\"{i}ve method).
An example could be
$\text{RelMAE} = \text{MAE} / \text{MAE}_\text{bm}$.
Nevertheless, even simple methods at times create (near-)perfect forecasts, in
  which case $\text{MAE}_\text{bm}$ becomes (close to) $0$.
These numerical instabilities occurred so often in our studies that we argue
against using such measures.
\item \textbf{Scaled Errors}:
\cite{hyndman2006} contribute this category and introduce the mean absolute
scaled error (\gls{mase}).
It is defined as the MAE from the actual forecasting method on the test day
(i.e., "out-of-sample") divided by the MAE from the (seasonal) na\"{i}ve
method on the entire training set (i.e., "in-sample").
A MASE of $1$ indicates that a forecasting method has the same accuracy
on the test day as the (seasonal) na\"{i}ve method applied on a longer
horizon, and lower values imply higher accuracy.
Within a pixel, its model rankings are identical to the ones obtained with the MAE.
Also, we acknowledge recent publications, for example, \cite{prestwich2014} or
\cite{kim2016}, showing other ways of tackling the difficulties mentioned.
However, only the MASE provided numerically stable results for all
forecasts in our study.
\end{enumerate}
Consequently, we use the MASE with a seasonal na\"{i}ve benchmark as the
primary measure in this paper.
With the previously introduced notation, it is defined as follows:
$$
\text{MASE}
:=
\frac{\text{MAE}_{\text{out-of-sample}}}{\text{MAE}_{\text{in-sample}}}
=
\frac{\text{MAE}_{\text{forecasts}}}{\text{MAE}_{\text{training}}}
=
\frac{\frac{1}{H} \sum_{h=1}^H |y_{T+h} - \hat{y}_{T+h}|}
{\frac{1}{T-k} \sum_{t=k+1}^T |y_{t} - y_{t-k}|}
$$
The denominator can only become $0$ if the seasonal na\"{i}ve benchmark makes
a perfect forecast on each day in the training set except the first seven
days, which never happened in our case study involving hundreds of
thousands of individual model trainings.
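A direct translation of this definition into code (a sketch in Python; the
    arrays are illustrative) is:
\begin{verbatim}
import numpy as np

def mase(y_test, y_hat, y_train, k=7):
    # Out-of-sample MAE of the forecasts on the test day divided by the
    # in-sample MAE of the (seasonal) naive method on the training set.
    mae_out = np.mean(np.abs(np.asarray(y_test) - np.asarray(y_hat)))
    y_train = np.asarray(y_train, dtype=float)
    mae_in = np.mean(np.abs(y_train[k:] - y_train[:-k]))
    return mae_out / mae_in

# Example: a horizontal series (periodicity k=7) with a one-step test day.
print(mase([5.0], [4.0], [3, 4, 6, 8, 9, 5, 4, 3, 5, 7, 8, 9, 6, 4], k=7))
\end{verbatim}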
Further, as per the discussion in the subsequent Sub-section \ref{decomp}, we
    also calculate peak-MASEs where we leave out the time steps of non-peak
    times from the calculations.
For this analysis, we define all time steps that occur at lunch (i.e., noon to
2 pm) and dinner time (i.e., 6 pm to 8 pm) as peak.
As time steps in non-peak times typically average no or very low order counts,
    a UDP may choose not to forecast these at all and rather be interested
    in the accuracies of forecasting methods during peaks only.
We conjecture that percentage error measures may be usable for UDPs facing a
higher overall demand with no intra-day down-times in between but have to
leave that to a future study.
Yet, even with high and steady demand, divide-by-zero errors are likely to
occur.

76
tex/3_mod/6_decomp.tex Normal file
View file

@ -0,0 +1,76 @@
\subsection{Time Series Decomposition}
\label{decomp}
Concerning the time table in Figure \ref{f:timetable}, a seasonal demand
pattern is inherent to both horizontal and vertical time series.
First, the weekday influences whether people eat out or order in, with our
    partner receiving more orders on Thursday through Saturday than on the
    other four days.
This pattern is part of both types of time series.
Second, on any given day, demand peaks occur around lunch and dinner times.
This pattern only concerns vertical series.
Statistical analyses show that horizontally sliced time series indeed exhibit
    a periodicity of $k=7$, whereas vertically sliced series only yield a
    seasonal component with a regular pattern if the periodicity is set to
    the product of the number of weekdays and the number of daily time
    steps, indicating a distinct intra-day pattern per weekday.
Figure \ref{f:stl} shows three exemplary STL decompositions for a
$1~\text{km}^2$ pixel and a vertical time series with 60-minute time steps
(on the x-axis) covering four weeks:
With the noisy raw data $y_t$ on the left, the seasonal and trend components,
$s_t$ and $t_t$, are depicted in light and dark gray for increasing $ns$
parameters.
The plots include (seasonal) na\"{i}ve forecasts for the subsequent test day
as dotted lines.
The remainder components $r_t$ are not shown for conciseness.
The periodicity is set to $k = 7 \times 12 = 84$ as our industry partner has
    $12$ opening hours per day.
\begin{center}
\captionof{figure}{STL decompositions for a medium-demand pixel with hourly
time steps and periodicity $k=84$}
\label{f:stl}
\includegraphics[width=.95\linewidth]{static/stl_gray.png}
\end{center}
As described in Sub-section \ref{stl}, with $k$ being implied by the
application, at the very least, the length of the seasonal smoothing
window, represented by the $ns$ parameter, must be calibrated by the
forecaster:
It controls how many past observations go into each smoothed $s_t$.
Many practitioners, however, skip this step and set $ns$ to a large number,
    for example, $999$, which is then referred to as "periodic."
For the other parameters, it is common to use the default values as
specified in \cite{cleveland1990}.
The goal is to find a decomposition with a regular pattern in $s_t$.
In Figure \ref{f:stl}, this is not true for $ns=7$ where, for
example, the four largest bars corresponding to the same time of day a
week apart cannot be connected by an approximately straight line.
At the other extreme, the pattern is perfectly regular for $ns=999$, where
    the same four largest bars are all of the same height.
This observation holds for each time step of the day.
For $ns=11$, $s_t$ exhibits a regular pattern whose bars adapt over time:
The pattern is regular as bars corresponding to the same time of day can be
connected by approximately straight lines, and it is adaptive as these
lines are not horizontal.
The trade-off between small and large values for $ns$ can thus be interpreted
    as allowing the average demand during peak times to change over time:
If demand is intermittent at non-peak times, it is reasonable to expect the
    bars to change over time because, with the seasonal component centered
    around $0$, only the relative differences between peak and non-peak
    times impact the bars' heights.
To confirm the goodness of a decomposition statistically, one way is to verify
that $r_t$ can be modeled as a typical error process like white noise
$\epsilon_t$.
However, we suggest an alternative way of calibrating the STL method in an
automated fashion based on our unified CV approach.
As hinted at in Figure \ref{f:stl}, we interpret an STL decomposition as a
forecasting method on its own by just adding the (seasonal) na\"{i}ve
forecasts for $s_t$ and $t_t$ and predicting $0$ for $r_t$.
Then, the $ns$ parameter is tuned just like a parameter for an ML model.
To the best of our knowledge, this has not been proposed before.
Conceptually, forecasting with the STL method can be viewed as a na\"{i}ve
method with built-in smoothing, and it outperformed all other
benchmark methods in all cases.
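A sketch of this idea (in Python, using statsmodels' STL implementation; the
    series is simulated and the parameter values are only examples) is shown
    below; the $ns$ candidate with the lowest validation error would be kept:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL

def stl_naive_forecast(y, period=84, ns=11, horizon=12):
    # Treat the STL decomposition itself as a forecasting method:
    # seasonal naive forecast of s_t plus a naive forecast of t_t,
    # and 0 for the remainder r_t.
    res = STL(y, period=period, seasonal=ns).fit()
    seasonal_fc = res.seasonal[-period:][:horizon]  # repeat last seasonal cycle
    trend_fc = np.full(horizon, res.trend[-1])      # carry last trend value forward
    return seasonal_fc + trend_fc

rng = np.random.default_rng(0)
y = rng.poisson(3, size=4 * 7 * 12).astype(float)   # four weeks, 12 steps per day
for ns in (7, 11, 999):                              # candidates to be validated
    print(ns, stl_naive_forecast(y, period=84, ns=ns, horizon=12)[:3])
\end{verbatim}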

View file

@ -0,0 +1,20 @@
\subsection{Forecasting Models}
\label{models}
This sub-section describes the concrete models in our study.
Figure \ref{f:inputs} shows how we classify them into four families with
regard to the type of the time series, horizontal or vertical, and the
moment at which a model is trained:
Solid lines indicate that the corresponding time steps lie before the
training, and dotted lines show the time horizon predicted by a model.
For conciseness, we only show the forecasts for one test day.
The setup is the same for each inner validation day.
\
\begin{center}
\captionof{figure}{Classification of the models by input type and training
moment}
\label{f:inputs}
\includegraphics[width=.95\linewidth]{static/model_inputs_gray.png}
\end{center}

View file

@ -0,0 +1,42 @@
\subsubsection{Horizontal and Whole-day-ahead Forecasts.}
\label{hori}
The upper-left in Figure \ref{f:inputs} illustrates the simplest way to
generate forecasts for a test day before it has started:
For each time of the day, the corresponding horizontal slice becomes the input
for a model.
With whole days being the unified time interval, each model is trained $H$
    times, each time providing a one-step-ahead forecast.
While it is possible to select a different model type per time step, doing
    so did not improve the accuracy in the empirical study.
As the models in this family do not include the test day's demand data in
their training sets, we see them as benchmarks to answer \textbf{Q4},
checking if a UDP can take advantage of real-time information.
The models in this family are as follows; we use prefixes, such as "h" here,
when methods are applied in other families as well:
\begin{enumerate}
\item \textit{\gls{naive}}:
Observation from the same time step one week prior
\item \textit{\gls{trivial}}:
Predict $0$ for all time steps
\item \textit{\gls{hcroston}}:
Intermittent demand method introduced by \cite{croston1972}
\item \textit{\gls{hholt}},
\textit{\gls{hhwinters}},
\textit{\gls{hses}},
\textit{\gls{hsma}}, and
\textit{\gls{htheta}}:
Exponential smoothing without calibration
\item \textit{\gls{hets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{harima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
\textit{naive} and \textit{trivial} provide an absolute benchmark for the
actual forecasting methods.
\textit{hcroston} is often mentioned in the context of intermittent demand;
however, the method did not perform well at all.
Besides \textit{hhwinters}, which always fits a seasonal component, the
    calibration heuristics behind \textit{hets} and \textit{harima} may fit
    one as well.
With $k=7$, an STL decomposition is unnecessary here.
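The following sketch (in Python, using statsmodels; simulated data)
    illustrates the family's mechanics for \textit{hses}: one model per time
    of day, each trained on the corresponding horizontal slice and asked for
    a one-step-ahead forecast:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def horizontal_day_ahead(timetable):
    # One model per time of day; each provides a one-step-ahead forecast,
    # so the whole test day is covered before it starts.
    forecasts = {}
    for time_of_day, series in timetable.items():
        fit = SimpleExpSmoothing(np.asarray(series, dtype=float)).fit()
        forecasts[time_of_day] = float(fit.forecast(1)[0])
    return forecasts

# Example: three weeks of daily counts for the 11:00 and 12:00 time steps.
rng = np.random.default_rng(1)
timetable = {11: rng.poisson(2, size=21), 12: rng.poisson(5, size=21)}
print(horizontal_day_ahead(timetable))
\end{verbatim}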

View file

@ -0,0 +1,39 @@
\subsubsection{Vertical and Whole-day-ahead Forecasts without Retraining.}
\label{vert}
The upper-right in Figure \ref{f:inputs} shows an alternative way to
generate forecasts for a test day before it has started:
First, a seasonally-adjusted time series $a_t$ is obtained from a vertical
time series by STL decomposition.
Then, the actual forecasting model, trained on $a_t$, makes an $H$-step-ahead
prediction.
Lastly, we add the $H$ seasonal na\"{i}ve forecasts for the seasonal component
$s_t$ to them to obtain the actual predictions for the test day.
Thus, only one training is required per model type, and no real-time data
    are used.
By decomposing the raw time series, all long-term patterns are assumed to be
in the seasonal component $s_t$, and $a_t$ only contains the level with
a potential trend and auto-correlations.
The models in this family are:
\begin{enumerate}
\item \textit{\gls{fnaive}},
\textit{\gls{pnaive}}:
Sum of STL's trend and seasonal components' na\"{i}ve forecasts
\item \textit{\gls{vholt}},
\textit{\gls{vses}}, and
\textit{\gls{vtheta}}:
Exponential smoothing without calibration and seasonal
fit
\item \textit{\gls{vets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{varima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
As mentioned in Sub-section \ref{unified_cv}, we include the sum of the
(seasonal) na\"{i}ve forecasts of the STL's trend and seasonal components
as forecasts on their own:
For \textit{fnaive}, we tune the "flexible" $ns$ parameter, and for
\textit{pnaive}, we set it to a "periodic" value.
Thus, we implicitly assume that there is no signal in the remainder $r_t$, and
predict $0$ for it.
\textit{fnaive} and \textit{pnaive} are two additional simple benchmarks.
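To illustrate the mechanics with one member of the family, the sketch below
    (in Python, using statsmodels; \textit{vses}-like, with simulated data)
    decomposes a vertical series, forecasts the adjusted series $H$ steps
    ahead, and adds the seasonal na\"{i}ve forecasts back:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def vertical_day_ahead(y, period=84, ns=11, H=12):
    res = STL(y, period=period, seasonal=ns).fit()
    adjusted = y - res.seasonal                 # a_t = trend + remainder
    model_fc = SimpleExpSmoothing(adjusted).fit().forecast(H)
    seasonal_fc = res.seasonal[-period:][:H]    # seasonal naive forecast of s_t
    return model_fc + seasonal_fc               # predictions for the test day

rng = np.random.default_rng(2)
y = rng.poisson(3, size=4 * 7 * 12).astype(float)
print(vertical_day_ahead(y)[:3])
\end{verbatim}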

View file

@ -0,0 +1,22 @@
\subsubsection{Vertical and Real-time Forecasts with Retraining.}
\label{rt}
The lower-left in Figure \ref{f:inputs} shows how models trained on vertical
time series are extended with real-time order data as it becomes available
during a test day:
Instead of obtaining an $H$-step-ahead forecast, we retrain a model after
every time step and only predict one step.
The remainder is as in the previous sub-section, and the models are:
\begin{enumerate}
\item \textit{\gls{rtholt}},
\textit{\gls{rtses}}, and
\textit{\gls{rttheta}}:
Exponential smoothing without calibration and seasonal fit
\item \textit{\gls{rtets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{rtarima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
Retraining \textit{fnaive} and \textit{pnaive} did not increase accuracy, and
thus we left them out.
A downside of this family is the significant increase in computing costs.
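A sketch of the retraining loop (in Python; \textit{rtses}-like, with the
    same assumptions as before) makes that burden apparent, as both the
    decomposition and the model are re-fit after every observed time step:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def real_time_day(history, actuals_today, period=84, ns=11):
    y = np.asarray(history, dtype=float)
    forecasts = []
    for actual in actuals_today:
        res = STL(y, period=period, seasonal=ns).fit()
        adjusted = y - res.seasonal
        step_fc = SimpleExpSmoothing(adjusted).fit().forecast(1)[0]
        forecasts.append(step_fc + res.seasonal[-period])  # add back s_t
        y = np.append(y, actual)                            # real-time update
    return forecasts

rng = np.random.default_rng(3)
history = rng.poisson(3, size=4 * 7 * 12).astype(float)
print(real_time_day(history, actuals_today=[2.0, 4.0, 5.0]))
\end{verbatim}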

View file

@ -0,0 +1,54 @@
\subsubsection{Vertical and Real-time Forecasts without Retraining.}
\label{ml_models}
The lower-right in Figure \ref{f:inputs} shows how ML models take
real-time order data into account without retraining.
Based on the seasonally-adjusted time series $a_t$, we employ the feature
matrix and label vector representations from Sub-section \ref{learning}
and set $n$ to the number of daily time steps, $H$, to cover all potential
auto-correlations.
The ML models are trained once before a test day starts.
For training, the matrix and vector are populated such that $y_T$ corresponds
    to the last time step of the day preceding the test day, $a_T$.
As the splitting during CV is done with whole days, the \gls{ml} models are
    trained with training sets containing samples from all times of a day in
    equal proportion.
Thus, the ML models learn to predict each time of the day.
For prediction on a test day, the $H$ observations preceding the time
step to be forecast are used as the input vector after seasonal
adjustment.
As a result, real-time data are included.
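A minimal sketch of this representation and of fitting one model of the
    family (in Python with scikit-learn; simulated data and illustrative
    parameters) follows; the models themselves are listed next:
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def lag_matrix(a, H=12):
    # Labels y_t and feature rows (a_{t-1}, ..., a_{t-H}) built from the
    # seasonally-adjusted series a_t, as described in the text above.
    y = np.array([a[t] for t in range(H, len(a))])
    X = np.array([a[t - H:t][::-1] for t in range(H, len(a))])
    return X, y

rng = np.random.default_rng(4)
a = rng.normal(0.0, 1.0, size=4 * 7 * 12)   # stand-in for the adjusted series
X, y = lag_matrix(a, H=12)
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# On the test day, the H observations preceding the next time step (after
# seasonal adjustment) form the input vector, so real-time data enter here.
x_next = a[-12:][::-1].reshape(1, -1)
print(model.predict(x_next))
\end{verbatim}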
The models in this family are:
\begin{enumerate}
\item \textit{\gls{vrfr}}: RF trained on the matrix as described
\item \textit{\gls{vsvr}}: SVR trained on the matrix as described
\end{enumerate}
We tried other ML models such as gradient boosting machines but found
only RFs and SVRs to perform well in our study.
In the case of gradient boosting machines, this is to be expected as they are
    known not to perform well in the presence of high noise (as is natural
    with low count data), as shown, for example, by \cite{ma2018} or
    \cite{mason2000}.
Also, deep learning methods are not applicable as the feature matrices only
consist of several hundred to thousands of rows (cf., Sub-section
\ref{params}).
In \ref{tabular_ml_models}, we provide an alternative feature matrix
representation that exploits the two-dimensional structure of time tables
without decomposing the time series.
In \ref{enhanced_feats}, we show how feature matrices are extended
to include predictors other than historical order data.
However, to answer \textbf{Q5} already here, none of the external data sources
improves the results in our study.
Due to the high number of time series in our study, investigating why no
    external source improves the forecasts requires an automated approach to
    analyzing individual time series.
\cite{barbour2014} provide a spectral density estimation approach, called
    the Shannon entropy, that measures the signal-to-noise ratio of a time
    series with a number normalized between $0$ and $1$, where lower values
    indicate a higher signal-to-noise ratio.
We then look at averages of the estimates on a daily level per pixel and
    find that including any of the external data sources from
    \ref{enhanced_feats} always leads to significantly lower signal-to-noise
    ratios.
Thus, we conclude that, at least for the demand faced by our industry
    partner, the historical order data already contain all of the signal.

View file

@ -1,2 +1,5 @@
\section{Empirical Study: A Meal Delivery Platform in Europe}
\label{stu}
% temporary placeholder
\label{params}

54
tex/apx/case_study.tex Normal file
View file

@ -0,0 +1,54 @@
\section{Raw Order Data in the Case Study}
\label{dataset}
The raw data for the empirical study in Section \ref{stu} were provided by a
    meal delivery platform operating in five cities in France in 2016.
The platform received a total of 686,385 orders distributed as follows:
\
\begin{center}
\begin{tabular}{llr}
\hline
\thead{City} & \thead{Launch Day} & \thead{Orders} \\
\hline
Bordeaux & July 18 & 64,012 \\
Lille & October 30 & 14,362 \\
Lyon & February 21 & 214,635 \\
Nantes & October 31 & 12,900 \\
Paris & March 7 & 380,476 \\
\end{tabular}
\end{center}
\
The part of the database relevant for forecasting can be thought of as one
table per city, where each row represents one order and consists of the
following groups of columns:
\begin{enumerate}
\item \textbf{Restaurant Data}
\begin{enumerate}
\item unique ID and name
\item pickup location as latitude-longitude pair
\end{enumerate}
\item \textbf{Customer Data}
\begin{enumerate}
\item unique ID, name, and phone number
\item delivery location as latitude-longitude pair (mostly physical
addresses but also public spots)
\end{enumerate}
\item \textbf{Timestamps}
\begin{enumerate}
\item placement via the smartphone app
\item fulfillment workflow (pickup, delivery, cancellation, re-deliveries)
\end{enumerate}
\item \textbf{Courier Data}
\begin{enumerate}
\item unique ID, name, and phone number
\item shift data (begin, breaks, end)
\item average speed
\end{enumerate}
\item \textbf{Order Details}
\begin{enumerate}
\item meals and drinks
\item prices and discounts granted
\end{enumerate}
\end{enumerate}

121
tex/apx/enhanced_feats.tex Normal file
View file

@ -0,0 +1,121 @@
\section{Enhancing Forecasting Models with External Data}
\label{enhanced_feats}
In this appendix, we show how the feature matrix in Sub-section
\ref{ml_models} can be extended with features other than historical order
data.
Then, we provide an overview of what external data we tried out as predictors
in our empirical study.
\subsection{Enhanced Feature Matrices}
Feature matrices can naturally be extended by appending new feature columns
$x_{t,f}$ or $x_f$ on the right where the former represent predictors
changing throughout a day and the latter being static either within a
pixel or across a city.
$f$ refers to an external predictor variable, such as one of the examples
listed below.
In the SVR case, the columns should be standardized before fitting as external
    predictors are most likely on a different scale than the historical
    order data.
Thus, for a matrix with seasonally-adjusted order data $a_t$ in it, an
enhanced matrix looks as follows:
$$
\vec{y}
=
\begin{pmatrix}
a_T \\
a_{T-1} \\
\dots \\
a_{H+1}
\end{pmatrix}
~~~~~
\mat{X}
=
\begin{bmatrix}
a_{T-1} & a_{T-2} & \dots & a_{T-H} & ~~~
& x_{T,A} & \dots & x_{B} & \dots \\
a_{T-2} & a_{T-3} & \dots & a_{T-(H+1)} & ~~~
& x_{T-1,A} & \dots & x_{B} & \dots \\
\dots & \dots & \dots & \dots & ~~~
& \dots & \dots & \dots & \dots \\
a_H & a_{H-1} & \dots & a_1 & ~~~
& x_{H+1,A} & \dots & x_{B} & \dots
\end{bmatrix}
$$
\
Similarly, we can also enhance the tabular matrices from
\ref{tabular_ml_models}.
The same comments as for their pure equivalents in Sub-section \ref{ml_models}
apply, in particular, that ML models trained with an enhanced matrix can
process real-time data without being retrained.
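A sketch of such an enhancement (in Python with scikit-learn; the external
    columns here, a time-varying temperature and a static ambient-population
    count, are made-up examples) could look as follows:
\begin{verbatim}
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

rng = np.random.default_rng(5)
n_samples, H = 200, 12
X_lags = rng.normal(size=(n_samples, H))             # a_{t-1}, ..., a_{t-H}
x_temp = rng.normal(20.0, 5.0, size=(n_samples, 1))  # time-varying x_{t,f}
x_pop = np.full((n_samples, 1), 12345.0)             # static x_f for the pixel
X = np.hstack([X_lags, x_temp, x_pop])               # enhanced feature matrix
y = rng.normal(size=n_samples)

# Standardize all columns before fitting the SVR, as external predictors
# are on a different scale than the (adjusted) order data.
svr = make_pipeline(StandardScaler(), SVR(kernel="rbf")).fit(X, y)
print(svr.predict(X[:1]))
\end{verbatim}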
\subsection{External Data in the Empirical Study}
\label{external_data}
In the empirical study, we tested four groups of external features that we
briefly describe here.
\vskip 0.1in
\textbf{Calendar Features}:
\begin{itemize}
\item Time of day (as synthesized integers: e.g., 1,050 for 10:30 am,
or 1,600 for 4 pm)
\item Day of week (as one-hot encoded booleans)
\item Work day or not (as booleans)
\end{itemize}
\vskip 0.1in
\textbf{Features Derived from the Historical Order Data}:
\begin{itemize}
\item Number of pre-orders for a time step (as integers)
\item 7-day SMA of the percentages of discounted orders (as percentages):
The platform is known for running marketing campaigns aimed at
first-time customers at irregular intervals. Consequently, the
order data show a wave-like pattern of coupons redeemed when looking
at the relative share of discounted orders per day.
\end{itemize}
\vskip 0.1in
\textbf{Neighborhood Features}:
\begin{itemize}
\item Ambient population (as integers) as obtained from the ORNL LandScan
database
\item Number of active platform restaurants (as integers)
\item Number of overall restaurants, food outlets, retailers, and other
businesses (as integers) as obtained from the Google Maps and Yelp
web services
\end{itemize}
\vskip 0.1in
\textbf{Real-time Weather} (raw data obtained from IBM's
Wunderground database):
\begin{itemize}
\item Absolute temperature, wind speed, and humidity
(as decimals and percentages)
\item Relative temperature with respect to 3-day and 7-day historical
means (as decimals)
\item Day vs. night defined by sunset (as booleans)
\item Summarized description (as indicators $-1$, $0$, and $+1$)
\item Lags of the absolute temperature and the summaries covering the
previous three hours
\end{itemize}
\vskip 0.1in
Unfortunately, we must report that none of the mentioned external data
    improved the accuracy of the forecasts.
Some led to models overfitting the data, which could not be remedied.
Manual tests revealed that real-time weather data are the most promising
external source.
Nevertheless, the data provided by IBM's Wunderground database originate from
    weather stations close to airports, which implies that all pixels of a
    city share the same aggregate weather data.
If weather data become available at a more granular level in the future, we
    see some potential for exploitation.

261
tex/apx/peak_results.tex Normal file
View file

@ -0,0 +1,261 @@
\section{Forecasting Accuracies during Peak Times}
\label{peak_results}
This appendix shows all result tables from the main text with the MASE
averages calculated from time steps within peak times.
Peaks are the times of the day when the typical customer has lunch or dinner,
    defined to be either from 12 pm to 2 pm or from 6 pm to 8 pm.
While the exact MASE values differ from those in the main text, the relative
    ranks of the forecasting methods are the same except in rare cases.
\begin{center}
\captionof{table}{Top-3 models by training weeks and average demand
($1~\text{km}^2$ pixel size, 60-minute time steps)}
\label{t:results:a}
\begin{tabular}{|c|c|*{12}{c|}}
\hline
\multirow{3}{*}{\rotatebox{90}{\thead{Training}}}
& \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}}
& \multicolumn{3}{c|}{\thead{No Demand}}
& \multicolumn{3}{c|}{\thead{Low Demand}}
& \multicolumn{3}{c|}{\thead{Medium Demand}}
& \multicolumn{3}{c|}{\thead{High Demand}} \\
~ & ~
& \multicolumn{3}{c|}{(0 - 2.5)}
& \multicolumn{3}{c|}{(2.5 - 10)}
& \multicolumn{3}{c|}{(10 - 25)}
& \multicolumn{3}{c|}{(25 - $\infty$)} \\
\cline{3-14}
~ & ~
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$ \\
\hline \hline
\multirow{3}{*}{3} & 1
& \textbf{\textit{trivial}}
& 0.794 & \multirow{3}{*}{\rotatebox{90}{4586}}
& \textbf{\textit{hsma}}
& 0.817 & \multirow{3}{*}{\rotatebox{90}{2975}}
& \textbf{\textit{hsma}}
& 0.838 & \multirow{3}{*}{\rotatebox{90}{2743}}
& \textbf{\textit{rtarima}}
& 0.871 & \multirow{3}{*}{\rotatebox{90}{2018}} \\
~ & 2
& \textit{hsma} & 0.808 & ~
& \textit{hses} & 0.847 & ~
& \textit{hses} & 0.851 & ~
& \textit{rtses} & 0.872 & ~ \\
~ & 3
& \textit{pnaive} & 0.938 & ~
& \textit{hets} & 0.848 & ~
& \textit{hets} & 0.853 & ~
& \textit{rtets} & 0.874 & ~ \\
\hline
\multirow{3}{*}{4} & 1
& \textbf{\textit{trivial}}
& 0.791 & \multirow{3}{*}{\rotatebox{90}{4532}}
& \textbf{\textit{hsma}}
& 0.833 & \multirow{3}{*}{\rotatebox{90}{3033}}
& \textbf{\textit{hsma}}
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2687}}
& \textbf{\textit{vrfr}}
& 0.848 & \multirow{3}{*}{\rotatebox{90}{2016}} \\
~ & 2
& \textit{hsma} & 0.794 & ~
& \textit{hses} & 0.838 & ~
& \textit{hses} & 0.847 & ~
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
~ & 3
& \textit{pnaive} & 0.907 & ~
& \textit{hets} & 0.841 & ~
& \textit{hets} & 0.851 & ~
& \textit{rtses} & 0.857 & ~ \\
\hline
\multirow{3}{*}{5} & 1
& \textbf{\textit{trivial}}
& 0.782 & \multirow{3}{*}{\rotatebox{90}{4527}}
& \textbf{\textit{hsma}}
& 0.844 & \multirow{3}{*}{\rotatebox{90}{3055}}
& \textbf{\textit{hsma}}
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2662}}
& \textbf{\textit{vrfr}}
& 0.849 & \multirow{3}{*}{\rotatebox{90}{2019}} \\
~ & 2
& \textit{hsma} & 0.802 & ~
& \textit{hses} & 0.851 & ~
& \textit{hets} & 0.844 & ~
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
~ & 3
& \textit{pnaive} & 0.888 & ~
& \textit{hets} & 0.863 & ~
& \textit{hses} & 0.845 & ~
& \textit{vsvr} & 0.853 & ~ \\
\hline
\multirow{3}{*}{6} & 1
& \textbf{\textit{trivial}}
& 0.743 & \multirow{3}{*}{\rotatebox{90}{4470}}
& \textbf{\textit{hsma}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{3086}}
& \textbf{\textit{hsma}}
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2625}}
& \textbf{\textit{vrfr}}
& 0.844 & \multirow{3}{*}{\rotatebox{90}{2025}} \\
~ & 2
& \textit{hsma} & 0.765 & ~
& \textit{hses} & 0.853 & ~
& \textit{hses} & 0.844 & ~
& \textbf{\textit{hets}} & 0.847 & ~ \\
~ & 3
& \textit{pnaive} & 0.836 & ~
& \textit{hets} & 0.861 & ~
& \textit{hets} & 0.844 & ~
& \textit{vsvr} & 0.849 & ~ \\
\hline
\multirow{3}{*}{7} & 1
& \textbf{\textit{trivial}}
& 0.728 & \multirow{3}{*}{\rotatebox{90}{4454}}
& \textbf{\textit{hsma}}
& 0.855 & \multirow{3}{*}{\rotatebox{90}{3132}}
& \textbf{\textit{hets}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2597}}
& \textbf{\textit{hets}}
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2007}} \\
~ & 2
& \textit{hsma} & 0.744 & ~
& \textit{hses} & 0.862 & ~
& \textit{hsma} & 0.845 & ~
& \textbf{\textit{vrfr}} & 0.842 & ~ \\
~ & 3
& \textit{pnaive} & 0.812 & ~
& \textit{hets} & 0.868 & ~
& \textbf{\textit{vsvr}} & 0.849 & ~
& \textit{vsvr} & 0.846 & ~ \\
\hline
\multirow{3}{*}{8} & 1
& \textbf{\textit{trivial}}
& 0.736 & \multirow{3}{*}{\rotatebox{90}{4402}}
& \textbf{\textit{hsma}}
& 0.865 & \multirow{3}{*}{\rotatebox{90}{3159}}
& \textbf{\textit{hets}}
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2575}}
& \textbf{\textit{hets}}
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
~ & 2
& \textit{hsma} & 0.759 & ~
& \textit{hets} & 0.874 & ~
& \textbf{\textit{vsvr}} & 0.848 & ~
& \textbf{\textit{vrfr}} & 0.841 & ~ \\
~ & 3
& \textit{pnaive} & 0.820 & ~
& \textit{hses} & 0.879 & ~
& \textit{hsma} & 0.850 & ~
& \textit{vsvr} & 0.847 & ~ \\
\hline
\end{tabular}
\end{center}
\begin{center}
\captionof{table}{Ranking of benchmark and horizontal models
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:hori:a}
\begin{tabular}{|c|ccc|cccccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{3}{c|}{\thead{Benchmarks}}
& \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
\cline{2-12}
~ & \textit{naive} & \textit{fnaive} & \textit{pnaive}
& \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
& \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
\hline \hline
3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
& 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
& 9 (10) & 10 (9) & 2 (8) & 1 (4) & 6 (7) \\
5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
& 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
& 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
\hline
\end{tabular}
\end{center}
\
\begin{center}
\captionof{table}{Ranking of classical models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:vert:a}
\begin{tabular}{|c|cc|ccccc|ccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{2}{c|}{\thead{Benchmarks}}
& \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
& \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
\cline{2-13}
~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
& \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
& \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
\hline \hline
3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
4 & 2 (7) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (8) & 4 (2) & 11 (12) \\
5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
& 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
& 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
& 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
8 & 2 (1) & 1 (9) & 8 (5) & 7 & 10 (8)
& 6 & 12 (10) & 5 (2) & 4 & 9 (6) & 3 & 11 \\
\hline
\end{tabular}
\end{center}
\
\pagebreak
\begin{center}
\captionof{table}{Ranking of ML models on vertical time series
($1~\text{km}^2$ pixel size, 60-minute time steps):
the table shows the ranks for cases with $2.5 < ADD < 25$
(and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:ml:a}
\begin{tabular}{|c|cccc|cc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{4}{c|}{\thead{Benchmarks}}
& \multicolumn{2}{c|}{\thead{ML}} \\
\cline{2-7}
~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
& \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
\hline \hline
3 & 6 & 2 (5) & 1 (3) & 3 (1) & 5 (2) & 4 \\
4 & 6 (5) & 2 (3) & 1 (6) & 3 (2) & 5 (1) & 4 \\
5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
\hline
\end{tabular}
\end{center}
\

View file

@ -0,0 +1,58 @@
\section{Tabular and Real-time Forecasts without Retraining}
\label{tabular_ml_models}
Regarding the structure of the feature matrix for the ML models in Sub-section
\ref{ml_models}, we provide an alternative approach that works without
the STL method.
Instead of decomposing a time series and arranging the resulting
seasonally-adjusted time series $a_t$ into a matrix $\mat{X}$, one can
create a matrix with two types of feature columns mapped to the raw
observations in $\vec{y}$:
While the first group of columns takes all observations of the same time of
day over a horizon of, for example, one week ($n_h=7$), the second group
takes all observations covering a pre-defined time horizon, for example
$3$ hours ($n_r=3$ for 60-minute time steps), preceding the time step to
be fitted.
Thus, we exploit the two-dimensional structure of time tables as well, and
conceptually model historical and recent demand.
The alternative feature matrix appears as follows where the first three
columns are the historical and the last three the recent demand features:
$$
\vec{y}
=
\begin{pmatrix}
y_T \\
y_{T-1} \\
\dots \\
y_{1+n_hH}
\end{pmatrix}
~~~~~
\mat{X}
=
\begin{bmatrix}
y_{T-H} & y_{T-2H} & \dots & y_{T-n_hH}
& y_{T-1} & y_{T-2} & \dots & y_{T-n_r} \\
y_{T-1-H} & y_{T-1-2H} & \dots & y_{T-1-n_hH}
& y_{T-2} & y_{T-3} & \dots & y_{T-n_r-1} \\
\dots & \dots & \dots & \dots
& \dots & \dots & \dots & \dots \\
y_{1+(n_h-1)H} & y_{1+(n_h-2)H} & \dots & y_1
& y^*_{1+n_hH-1} & y^*_{1+n_hH-2} & \dots & y^*_{1+n_hH-n_r}
\end{bmatrix}
$$
\
As a detail, we note that recent demand features that would lie at the end of
    the previous day are set to $0$, which is shown with the $^*$ notation
    above.
This alignment of the undecomposed order data $y_t$ ensures that the ML
models learn the two seasonal patterns independently.
The parameters $n_h$ and $n_r$ must be adapted to the data, but we found the
above values to work well.
As such matrices resemble time tables, we refer to them as tabular.
However, we found the ML models trained on vertical time series to outperform
    the tabular ML models, which is why we disregarded the latter in the study.
This tabular form could still be beneficial for UDPs whose demand exhibits a
    weaker seasonality than that of a meal delivery platform.
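For completeness, a sketch of how such a tabular matrix can be populated (in
    Python; the series is simulated and assumed to start at the first time
    step of a day) is:
\begin{verbatim}
import numpy as np

def tabular_matrix(y, H=12, n_h=7, n_r=3):
    # n_h same-time-of-day lags (historical demand) and n_r immediately
    # preceding time steps (recent demand); recent lags that would fall
    # onto the previous day are set to 0 (the * notation above).
    rows, labels = [], []
    for t in range(n_h * H, len(y)):
        hist = [y[t - j * H] for j in range(1, n_h + 1)]
        recent = [y[t - j] if (t % H) - j >= 0 else 0.0
                  for j in range(1, n_r + 1)]
        rows.append(hist + recent)
        labels.append(y[t])
    return np.array(rows), np.array(labels)

rng = np.random.default_rng(6)
y = rng.poisson(3, size=6 * 7 * 12).astype(float)
X, labels = tabular_matrix(y)
print(X.shape, labels.shape)   # (n_samples, n_h + n_r), (n_samples,)
\end{verbatim}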

View file

@ -5,6 +5,9 @@
\newglossaryentry{cv}{
name=CV, description={Cross Validation}
}
\newglossaryentry{mase}{
name=MASE, description={Mean Absolute Scaled Error}
}
\newglossaryentry{ml}{
name=ML, description={Machine Learning}
}
@ -27,4 +30,103 @@
name=VRP, description={Vehicle Routing Problem}
}
% Model names.
\newglossaryentry{naive}{
name=naive, description={(Seasonal) Na\"{i}ve Method}
}
\newglossaryentry{fnaive}{
name=fnaive, description={"Flexible" STL Decomposition,
with tuned ns parameter}
}
\newglossaryentry{pnaive}{
name=pnaive, description={"Periodic" STL Decomposition,
with ns parameter set to large number}
}
\newglossaryentry{trivial}{
name=trivial, description={Trivial Method}
}
\newglossaryentry{hcroston}{
name=hcroston, description={Croston's Method,
trained on horizontal time series}
}
\newglossaryentry{hholt}{
name=hholt, description={Holt's Linear Trend Method,
trained on horizontal time series}
}
\newglossaryentry{vholt}{
name=vholt, description={Holt's Linear Trend Method,
trained on vertical time series}
}
\newglossaryentry{rtholt}{
name=rtholt, description={Holt's Linear Trend Method,
(re)trained on vertical time series}
}
\newglossaryentry{hhwinters}{
name=hhwinters, description={Holt-Winters' Seasonal Method,
trained on horizontal time series}
}
\newglossaryentry{hses}{
name=hses, description={Simple Exponential Smoothing Method,
trained on horizontal time series}
}
\newglossaryentry{vses}{
name=vses, description={Simple Exponential Smoothing Method,
trained on vertical time series}
}
\newglossaryentry{rtses}{
name=rtses, description={Simple Exponential Smoothing Method,
(re)trained on vertical time series}
}
\newglossaryentry{hsma}{
name=hsma, description={Simple Moving Average Method,
trained on horizontal time series}
}
\newglossaryentry{htheta}{
name=htheta, description={Theta Method,
trained on horizontal time series}
}
\newglossaryentry{vtheta}{
name=vtheta, description={Theta Method,
trained on vertical time series}
}
\newglossaryentry{rttheta}{
name=rttheta, description={Theta Method,
(re)trained on vertical time series}
}
\newglossaryentry{hets}{
name=hets, description={ETS State Space Method,
trained on horizontal time series}
}
\newglossaryentry{vets}{
name=vets, description={ETS State Space Method,
trained on vertical time series}
}
\newglossaryentry{rtets}{
name=rtets, description={ETS State Space Method,
(re)trained on vertical time series}
}
\newglossaryentry{harima}{
name=harima, description={Autoregressive Integrated Moving Average
Method,
trained on horizontal time series}
}
\newglossaryentry{varima}{
name=varima, description={Autoregressive Integrated Moving Average
Method,
trained on vertical time series}
}
\newglossaryentry{rtarima}{
name=rtarima, description={Autoregressive Integrated Moving Average
Method,
(re)trained on vertical time series}
}
\newglossaryentry{vrfr}{
name=vrfr, description={Random Forest Regression Method,
trained on vertical time series}
}
\newglossaryentry{vsvr}{
name=vsvr, description={Support Vector Regression Method,
trained on vertical time series}
}
\printglossaries

View file

@ -4,6 +4,12 @@
\usepackage[acronym]{glossaries}
\makeglossaries
% Enable captions for figures and tables.
\usepackage{caption}
% Enable diagonal lines in tables.
\usepackage{static/slashbox}
% Make opening quotes look different than closing quotes.
\usepackage[english=american]{csquotes}
\MakeOuterQuote{"}

View file

@ -26,6 +26,16 @@ volume={1},
pages={461--466}
}
@article{barbour2014,
title={psd: Adaptive, sine multitaper power spectral density estimation for R},
author={Barbour, Andrew J and Parker, Robert L},
year={2014},
journal={Computers \& Geosciences},
volume={63},
pages={1--8},
publisher={Elsevier}
}
@misc{bell2018,
title = {Forecasting at Uber: An Introduction},
author={Bell, Franziska and Smyl, Slawek},
@ -139,6 +149,16 @@ number={1},
pages={3--73}
}
@article{croston1972,
title={Forecasting and Stock Control for Intermittent Demands},
author={Croston, J D},
year={1972},
journal={Journal of the Operational Research Society},
volume={23},
number={3},
pages={289--303}
}
@book{dagum2016,
title={Seasonal Adjustment Methods and Real Time Trend-Cycle Estimation},
author={Dagum, Estela and Bianconcini, Silvia},
@ -275,6 +295,17 @@ number={2},
pages={287--290}
}
@article{hyndman2006,
title={Another Look at Measures of Forecast Accuracy},
author={Hyndman, Rob and Koehler, Anne},
year={2006},
journal={International Journal of Forecasting},
volume={22},
number={4},
pages={679--688},
publisher={Elsevier}
}
@article{hyndman2008a,
title={Automatic Time Series Forecasting: The forecast package for R},
author={Hyndman, Rob and Khandakar, Yeasmin},
@ -298,6 +329,18 @@ year={2018},
publisher={OTexts}
}
@article{kim2016,
title={A new Metric of Absolute Percentage Error for Intermittent Demand
Forecasts},
author={Kim, Sungil and Kim, Heeyoung},
year={2016},
journal={International Journal of Forecasting},
volume={32},
number={3},
pages={669--679},
publisher={Elsevier}
}
@article{kwiatkowski1992,
title={Testing the null hypothesis of stationarity against the alternative of a
unit root: How sure are we that economic time series have a unit root?},
@ -319,6 +362,17 @@ howpublished = {\url{https://eng.uber.com/neural-networks/}},
note = {Accessed: 2020-10-01}
}
@article{ma2018,
title={Using the Gradient Boosting Decision Tree to Improve the Delineation of
Hourly Rain Areas during the Summer from Advanced Himawari Imager Data},
author={Ma, Liang and Zhang, Guoping and Lu, Er},
year={2018},
journal={Journal of Hydrometeorology},
volume={19},
number={5},
pages={761--776},
}
@article{masmoudi2018,
title={The dial-a-ride problem with electric vehicles and battery
swapping stations},
@ -330,6 +384,15 @@ volume={118},
pages={392--420}
}
@inproceedings{mason2000,
title={Boosting algorithms as gradient descent},
author={Mason, Llew and Baxter, Jonathan and Bartlett, Peter L
and Frean, Marcus R},
year={2000},
booktitle={Advances in Neural Information Processing Systems},
pages={512--518}
}
@inproceedings{mueller1997,
title={Predicting Time Series with Support Vector Machines},
author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar
@ -367,6 +430,18 @@ number={5},
pages={311--315}
}
@article{prestwich2014,
title={Mean-based Error Measures for Intermittent Demand Forecasting},
author={Prestwich, Steven and Rossi, Roberto and Tarim, Armagan
and Hnich, Brahim},
year={2014},
journal={International Journal of Production Research},
volume={52},
number={22},
pages={6782--6791},
publisher={Taylor \& Francis}
}
@incollection{scholkopf1998,
title={Fast Approximation of Support Vector Kernel Expansions, and an
Interpretation of Clustering as Approximation in Feature Spaces},
@ -378,6 +453,13 @@ publisher={Springer},
pages={125--132}
}
@book{singleton2017,
title={Urban Analytics},
author={Singleton, Alex David and Spielman, Seth and Folch, David},
year={2017},
publisher={Sage}
}
@article{smola2004,
title={A Tutorial on Support Vector Regression},
author={Smola, Alex and Sch{\"o}lkopf, Bernhard},
@ -398,6 +480,17 @@ pages={285--292},
publisher={MIT, Cambridge, MA, USA}
}
@article{syntetos2005,
title={The Accuracy of Intermittent Demand Estimates},
author={Syntetos, Aris and Boylan, John},
year={2005},
journal={International Journal of Forecasting},
volume={21},
number={2},
pages={303--314},
publisher={Elsevier}
}
@article{taylor2003,
title={Exponential Smoothing with a Damped Multiplicative Trend},
author={Taylor, James},
@ -442,6 +535,18 @@ volume={118},
pages={496--512}
}
@article{winkenbach2015,
title={Enabling urban logistics services at La Poste through
multi-echelon location-routing},
author={Winkenbach, Matthias and Kleindorfer, Paul R and Spinler, Stefan},
year={2015},
journal={Transportation Science},
volume={50},
number={2},
pages={520--540},
publisher={INFORMS}
}
@article{winters1960,
title={Forecasting Sales by Exponentially Weighted Moving Averages},
author={Winters, Peter},