Merge branch 'release-0.1.0' into main
Commit 1184c54377
52 changed files with 6042 additions and 1 deletion
.gitignore (vendored), 14 lines, new file
@@ -0,0 +1,14 @@
*.acn
*.acr
*.alg
*.aux
*.bbl
*.blg
*.glg
*.glo
*.gls
*.glsdefs
*.ist
*.log
*.spl
*.synctex.gz
@@ -6,5 +6,5 @@ to generate the academic paper
 based on the
 [urban-meal-delivery](https://github.com/webartifex/urban-meal-delivery)
 research project.
-The paper was submitted to and is currently under review by the journal
+The paper is accepted for publication by the journal
 [Transportation Research Part E: Logistics and Transportation Review](https://www.journals.elsevier.com/transportation-research-part-e-logistics-and-transportation-review).
paper.pdf, binary, new file (binary file not shown)
paper.tex, 61 lines, new file
@@ -0,0 +1,61 @@
\documentclass[preprint,review,12pt,authoryear]{static/elsarticle}

\input{tex/preamble}

\begin{document}

\input{tex/meta}
\newpage

\input{tex/1_intro}
\input{tex/2_lit/1_intro}
\input{tex/2_lit/2_class/1_intro}
\input{tex/2_lit/2_class/2_ets}
\input{tex/2_lit/2_class/3_arima}
\input{tex/2_lit/2_class/4_stl}
\input{tex/2_lit/3_ml/1_intro}
\input{tex/2_lit/3_ml/2_learning}
\input{tex/2_lit/3_ml/3_cv}
\input{tex/2_lit/3_ml/4_rf}
\input{tex/2_lit/3_ml/5_svm}
\input{tex/3_mod/1_intro}
\input{tex/3_mod/2_overall}
\input{tex/3_mod/3_grid}
\input{tex/3_mod/4_cv}
\input{tex/3_mod/5_mase}
\input{tex/3_mod/6_decomp}
\input{tex/3_mod/7_models/1_intro}
\input{tex/3_mod/7_models/2_hori}
\input{tex/3_mod/7_models/3_vert}
\input{tex/3_mod/7_models/4_rt}
\input{tex/3_mod/7_models/5_ml}
\input{tex/4_stu/1_intro}
\input{tex/4_stu/2_data}
\input{tex/4_stu/3_params}
\input{tex/4_stu/4_overall}
\input{tex/4_stu/5_training}
\input{tex/4_stu/6_fams}
\input{tex/4_stu/7_pixels_intervals}
\input{tex/5_con/1_intro}
\input{tex/5_con/2_generalizability}
\input{tex/5_con/3_implications}
\input{tex/5_con/4_further_research}
\newpage

\appendix
\newpage
\input{tex/apx/tabular_ml_models}
\newpage
\input{tex/apx/enhanced_feats}
\newpage
\input{tex/apx/case_study}
\newpage
\input{tex/apx/peak_results}
\newpage
\input{tex/apx/glossary}
\newpage

\bibliographystyle{static/elsarticle-harv}
\bibliography{tex/references}

\end{document}
static/cross_validation_gray.png, binary image, 4.5 KiB, new file (not shown)
static/elsarticle-harv.bst, 1597 lines, new file (diff suppressed: file too large)
static/elsarticle.cls, 1056 lines, new file (diff suppressed: file too large)
static/gridification_for_paris_gray.png, binary image, 261 KiB, new file (not shown)
static/model_inputs_gray.png, binary image, 84 KiB, new file (not shown)
static/slashbox.sty, 77 lines, new file
@@ -0,0 +1,77 @@
% slashbox.sty by Koichi Yasuoka, May 27, 1993
% minor modification by Toru Sato, May 31, 1993
\typeout{slashbox style by K.Yasuoka, May 1993.}%
\newbox\@slashboxa
\newbox\@slashboxb
\newbox\@slashboxc
\newcount\@slashboxwd
\newcount\@slashboxht
\newdimen\@slashsepl
\newdimen\@slashsepr
\def\slashbox{%
  \def\@slashboxpicture##1{%
    \put(0,0){\line(##1,1){\@slashboxwd}}%
    \put(0,\@slashboxht){\makebox(0,0)[tl]{\box\@slashboxa}}%
    \put(\@slashboxwd,0){\makebox(0,0)[br]{\box\@slashboxb}}%
  }%
  \@slashbox
}%
\def\backslashbox{%
  \def\@slashboxpicture##1{%
    \put(0,\@slashboxht){\line(##1,-1){\@slashboxwd}}%
    \put(0,0){\makebox(0,0)[bl]{\box\@slashboxa}}%
    \put(\@slashboxwd,\@slashboxht){\makebox(0,0)[tr]{\box\@slashboxb}}%
  }%
  \@slashbox
}%
\def\@slashbox{\@ifnextchar [{\@@slashbox}{\@@slashbox[0pt]}}
\def\@@slashbox[#1]{\@ifnextchar [{\@@@slashbox[#1]}{\@@@slashbox[#1][c]}}
\def\@@@slashbox[#1][#2]#3#4{%
% #1: width, #2: suppression of \tabcolsep on `l', `r', or `lr' side
% #3: left item, #4: right item
  \@slashsepl=\tabcolsep
  \@slashsepr=\tabcolsep
  \@tfor\@tempa :=#2\do{\expandafter\let
    \csname @slashsep\@tempa\endcsname=\z@}%
  \setbox\@slashboxa=\hbox{\strut\hskip\tabcolsep\shortstack[l]{#3}}%
  \setbox\@slashboxb=\hbox{\shortstack[r]{#4}\hskip\tabcolsep\strut}%
  \setbox\@slashboxa=\hbox{\raise\dp\@slashboxa\box\@slashboxa}%
  \setbox\@slashboxb=\hbox{\raise\dp\@slashboxb\box\@slashboxb}%
  \setbox\@slashboxc=\hbox{%
    \@tempdima=\wd\@slashboxa
    \advance\@tempdima by \wd\@slashboxb
    \advance\@tempdima by \@slashsepl
    \advance\@tempdima by \@slashsepr
    \@tempdimb=#1\relax%
    \ifdim\@tempdimb>\@tempdima \@tempdima=\@tempdimb\fi%
    \@tempdimb=\ht\@slashboxa
    \advance\@tempdimb by \dp\@slashboxa
    \advance\@tempdimb by \ht\@slashboxb
    \advance\@tempdimb by \dp\@slashboxb
    \@tempcnta=\@tempdima
    \@tempcntb=\@tempdimb
    \advance\@tempcnta by \@tempcntb
    \advance\@tempcnta by -1
    \divide\@tempcnta by \@tempcntb
    \ifnum\@tempcnta>6 \@tempcnta=6
      \@tempdimb=0.166666666\@tempdima
    \else
      \ifnum\@tempcnta<1 \@tempcnta=1\fi
      \@tempdima=\@tempdimb
      \multiply\@tempdima by \@tempcnta
    \fi%
    \advance\@tempdima by -\@slashsepl
    \advance\@tempdima by -\@slashsepr
    \@slashboxwd=\@tempdima
    \@slashboxht=\@tempdimb
    \@tempcntb=\@slashsepl
    \setlength{\unitlength}{1sp}%
    \begin{picture}(\@slashboxwd,\@slashboxht)(\@tempcntb,0)
      \advance\@tempdima by \@slashsepl
      \advance\@tempdima by \@slashsepr
      \@slashboxwd=\@tempdima
      \@slashboxpicture{\@tempcnta}
    \end{picture}%
  }%
  $\vcenter{\box\@slashboxc}$%
}%
static/stl_gray.png, binary image, 104 KiB, new file (not shown)
tex/1_intro.tex, 103 lines, new file
@@ -0,0 +1,103 @@
\section{Introduction}
\label{intro}

In recent years, many meal delivery platform providers (e.g., Uber Eats,
GrubHub, DoorDash, Deliveroo) with different kinds of business models have
entered the markets in cities around the world.
A study by \cite{hirschberg2016} estimates the global market size to surpass
20 billion dollars by 2025.
A common feature of these platforms is that they do not operate kitchens but
focus on marketing their partner restaurants' meals, unifying all
order-related processes in simple smartphone apps, and managing the delivery
via a fleet of either employees or crowd-sourced sub-contractors.

Various kinds of urban delivery platforms
(\gls{udp}; \ref{glossary} provides a glossary with all abbreviations)
have received attention in recent scholarly publications.
\cite{hou2018} look into heuristics to simultaneously optimize courier
scheduling and routing in general, while \cite{masmoudi2018} do so
for the popular dial-a-ride problem and \cite{wang2018} investigate
the effect of different fulfillment strategies in the context of urban
meal delivery.
\cite{ehmke2018} and \cite{alcaraz2019} focus their research on the routing
aspect, which is commonly modeled as a so-called vehicle routing problem
(\gls{vrp}).

What the recent literature does not cover is the demand forecasting problem
a UDP faces.
Because customers' locations are fragmented and, in the case of a meal
delivery platform, the majority of orders are placed ad-hoc for immediate
delivery, forecasting demand for the near future (i.e., several hours ahead)
and for distinct locations of the city in real-time is an essential factor
in achieving timely fulfillment.
In general, demand forecasting is a well-researched discipline with a
decades-long history in scholarly journals, as summarized, for example, by
\cite{de2006}.
Even some meal delivery platforms themselves publish their practices: For
example, \cite{bell2018} provide a general overview of supply and demand
forecasting at Uber together with benchmarks of the methods used, while
\cite{laptev2017} investigate how extreme events can be incorporated.

The conditions such platforms face are not limited to meal delivery:
Any entity that performs ad-hoc requested point-to-point transportation at
scale in an urban area benefits from a robust forecasting system.
Examples include ride-hailing, such as the original Uber offering, or bicycle
courier services.
The common characteristics are:
\begin{itemize}
\item \textbf{Geospatial Slicing}:
    Forecasts for distinct parts of a city in parallel
\item \textbf{Temporal Slicing}:
    Forecasts on a sub-daily basis (e.g., 60-minute windows)
\item \textbf{Order Sparsity}:
    The historical order time series exhibit an intermittent pattern
\item \textbf{Double Seasonality}:
    Demand varies with the day of the week and the time of day
\end{itemize}
Whereas the first two points can be assumed to vary with the concrete
application's requirements, it is the last two that pose challenges for
forecasting a platform's demand:
Intermittent demand (i.e., many observations in the historical order time
series exhibit no demand at all) renders most of the commonly applied error
metrics useless.
Moreover, many of the established forecasting methods can only handle a single
and often low seasonality (i.e., a regularly repeating pattern), if any at
all.

In this paper, we develop a rigorous methodology for building and
evaluating a robust forecasting system for a UDP
that offers ad-hoc point-to-point transportation of any kind.
We implement such a system with a broad set of commonly used forecasting
methods.
We not only apply established (i.e., "classical") time series methods but also
machine learning (\gls{ml}) models that have gained traction in recent
years due to advancements in computing power and the availability of larger
amounts of data.
In that regard, the classical methods serve as benchmarks for the ML methods.
Our system is trained on and evaluated with a dataset obtained from an
undisclosed industry partner that, during the timeframe of our study, was
active in several European countries and, in particular, in France.
Its primary business strategy is the delivery of meals from upper-class
restaurants to customers at their home or work places via bicycles.
In this empirical study, we identify the best-performing methods.
Thus, we answer the following research questions:
\begin{enumerate}
\item[\textbf{Q1}:]
    Which forecasting methods work best under what circumstances?
\item[\textbf{Q2}:]
    How do classical forecasting methods compare with ML models?
\item[\textbf{Q3}:]
    How does the forecast accuracy change with more historical data
    available?
\item[\textbf{Q4}:]
    Can real-time information on demand be exploited?
\item[\textbf{Q5}:]
    Can external data (e.g., weather data) improve the forecast accuracy?
\end{enumerate}
To the best of our knowledge, no such study has yet been published in a
scholarly journal.

The subsequent Section \ref{lit} reviews the literature on the forecasting
methods included in the system.
Section \ref{mod} introduces our forecasting system, and Section \ref{stu}
discusses the results obtained in the empirical study.
Lastly, Section \ref{con} summarizes our findings and concludes
with an outlook on further research opportunities.
tex/2_lit/1_intro.tex, 17 lines, new file
@@ -0,0 +1,17 @@
\section{Literature Review}
\label{lit}

In this section, we review the specific forecasting methods that make up our
forecasting system.
We group them into classical statistical and ML models.
The two groups differ mainly in how they represent the input data and how
accuracy is evaluated.

A time series is a finite and ordered sequence of equally spaced observations.
Thus, time is regarded as discrete and a time step as a short period.
Formally, a time series $Y$ is defined as $Y = \{y_t: t \in I\}$, or $y_t$ for
short, where $I$ is an index set of positive integers.
Besides its length $T = |Y|$, another property is the a priori fixed and
non-negative periodicity $k$ of a seasonal pattern in the demand:
$k$ is the number of time steps after which a pattern repeats itself (e.g.,
$k=12$ for monthly sales data).
tex/2_lit/2_class/1_intro.tex, 13 lines, new file
@@ -0,0 +1,13 @@
\subsection{Demand Forecasting with Classical Forecasting Methods}
\label{class_methods}

Forecasting became a formal discipline starting in the 1950s and has its
origins in the broader field of statistics.
\cite{hyndman2018} provide a thorough overview of the established concepts
and methods, and \cite{ord2017} present business-related applications
such as demand forecasting.
These "classical" forecasting methods share the characteristic that they are
first trained over the entire $Y$.
Then, for prediction, the forecaster specifies the number of time steps for
which forecasts are to be generated.
That is different for ML models.
tex/2_lit/2_class/2_ets.tex, 78 lines, new file
@@ -0,0 +1,78 @@
\subsubsection{Na\"{i}ve Methods, Moving Averages, and Exponential Smoothing}
\label{ets}

Simple forecasting methods are often employed as a benchmark for more
sophisticated ones.
The so-called na\"{i}ve and seasonal na\"{i}ve methods forecast the next time
step in a time series, $y_{T+1}$, with the last observation, $y_T$,
and, if a seasonal pattern is present, with the observation $k$ steps
before, $y_{T+1-k}$.
As variants, both methods can be generalized to include drift terms in the
presence of a trend or a changing seasonal amplitude.

If a time series exhibits no trend, a simple moving average (SMA) is a
generalization of the na\"{i}ve method that is more robust to outliers.
It is defined as follows: $\hat{y}_{T+1} = \frac{1}{h} \sum_{i=T-h+1}^{T} y_i$
where $h$ is the horizon over which the average is calculated.
If a time series exhibits a seasonal pattern, setting $h$ to a multiple of the
periodicity $k$ suffices to keep the forecast unbiased.
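These three baselines translate into a few lines of code. The following is a
minimal sketch, assuming a one-dimensional NumPy array holding the demand
history; the function names are ours, not taken from any library:

```python
import numpy as np

def naive(y: np.ndarray) -> float:
    """Forecast y_{T+1} with the last observation, y_T."""
    return y[-1]

def seasonal_naive(y: np.ndarray, k: int) -> float:
    """Forecast y_{T+1} with the observation k steps before, y_{T+1-k}."""
    return y[-k]

def sma(y: np.ndarray, h: int) -> float:
    """Forecast y_{T+1} with the average of the last h observations."""
    return y[-h:].mean()
```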

Starting in the 1950s, another popular family of forecasting methods,
so-called exponential smoothing methods, was introduced by
\cite{brown1959}, \cite{holt1957}, and \cite{winters1960}.
The idea is that forecasts $\hat{y}_{T+1}$ are a weighted average of past
observations where the weights decay over time; in the case of the simple
exponential smoothing (SES) method, we obtain:
$
\hat{y}_{T+1} = \alpha y_T + \alpha (1 - \alpha) y_{T-1}
              + \alpha (1 - \alpha)^2 y_{T-2}
              + \dots + \alpha (1 - \alpha)^{T-1} y_{1}
$
where $\alpha$ (with $0 \le \alpha \le 1$) is a smoothing parameter.

Exponential smoothing methods are often expressed in an alternative component
form that consists of a forecast equation and one or more smoothing
equations for unobservable components.
Below, we present a generalization of SES, the so-called Holt-Winters'
seasonal method, in an additive formulation.
$\ell_t$, $b_t$, and $s_t$ represent the unobservable level, trend, and
seasonal components inherent in $y_t$, and $\beta$ and $\gamma$ complement
$\alpha$ as smoothing parameters:
\begin{align*}
\hat{y}_{t+1} & = \ell_t + b_t + s_{t+1-k} \\
\ell_t & = \alpha(y_t - s_{t-k}) + (1 - \alpha)(\ell_{t-1} + b_{t-1}) \\
b_t & = \beta (\ell_{t} - \ell_{t-1}) + (1 - \beta) b_{t-1} \\
s_t & = \gamma (y_t - \ell_{t-1} - b_{t-1}) + (1-\gamma)s_{t-k}
\end{align*}
With $b_t$, $s_t$, $\beta$, and $\gamma$ removed, this formulation reduces to
SES.
Distinct variations exist: Besides the three components, \cite{gardner1985}
add dampening for the trend, \cite{pegels1969} provides multiplicative
formulations, and \cite{taylor2003} adds dampening to the latter.
The accuracy measure commonly employed is the sum of squared errors between
the observations and their forecasts.

Originally introduced by \cite{assimakopoulos2000}, the Theta method can be
regarded, as \cite{hyndman2003} show, as equivalent to SES with a drift
term.
We mention this method here only because \cite{bell2018} emphasize that it
performs well at Uber.
However, in our empirical study, we find that this is not true in general.

\cite{hyndman2002} introduce statistical processes, so-called innovations
state-space models, to generalize the methods in this sub-section.
They call this family of models ETS as they capture error, trend, and seasonal
terms.
Linear and additive ETS models have the following structure:
\begin{align*}
y_t & = \vec{w} \cdot \vec{x}_{t-1} + \epsilon_t \\
\vec{x}_t & = \mat{F} \vec{x}_{t-1} + \vec{g} \epsilon_t
\end{align*}
$y_t$ denotes the observations as before, while $\vec{x}_t$ is a state vector
of unobserved components.
$\epsilon_t$ is a white noise series, and the matrix $\mat{F}$ and the vectors
$\vec{g}$ and $\vec{w}$ contain a model's coefficients.
Just like the models in the next sub-section, ETS models are commonly fitted
with maximum likelihood and evaluated against historical data using
information theoretical criteria.
We refer to \cite{hyndman2008b} for a thorough summary.
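For illustration, the additive Holt-Winters' method above can be fitted in a
few lines, for example, with the statsmodels library. This is a sketch under
the assumption of an hourly series with weekly periodicity $k = 168$ and a
hypothetical input file, not the implementation used in our study:

```python
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

y = np.loadtxt("demand.csv")  # hypothetical file, one observation per line

# Additive trend and seasonality; alpha, beta, and gamma are estimated
# by minimizing the sum of squared errors on the training data.
model = ExponentialSmoothing(
    y, trend="add", seasonal="add", seasonal_periods=168,
)
fit = model.fit()
y_hat = fit.forecast(1)  # one-step-ahead forecast for y_{T+1}
```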
tex/2_lit/2_class/3_arima.tex, 69 lines, new file
@@ -0,0 +1,69 @@
\subsubsection{Autoregressive Integrated Moving Averages}
\label{arima}

\cite{box1962}, \cite{box1968}, and more papers by the same authors in the
1960s introduce a type of model for stationary time series where observations
correlate with their neighbors, and refer to them as autoregressive
integrated moving average (ARIMA) models.
For a thorough overview, we refer to \cite{box2015} and \cite{brockwell2016}.

A time series $y_t$ is stationary if its moments are independent of the
point in time where it is observed.
A typical example is a white noise series $\epsilon_t$.
Therefore, a trend or seasonality implies non-stationarity.
\cite{kwiatkowski1992} provide a test to check the null hypothesis of
stationary data.
To obtain a stationary time series, one chooses from several techniques:
First, to stabilize a changing variance (i.e., heteroscedasticity), one
applies a Box-Cox transformation (e.g., $\log$), as first suggested by
\cite{box1964}.
Second, to factor out a trend (or seasonal) pattern, one computes differences
of consecutive (or of lag $k$) observations, or even differences thereof.
Third, it is also common to pre-process $y_t$ with one of the decomposition
methods mentioned in Sub-section \ref{stl} below, with an ARIMA model
then trained on the adjusted $y_t$.

In the autoregressive part, observations are modeled as linear combinations
of their predecessors.
Formally, an $AR(p)$ model is defined with a drift term $c$, coefficients
$\phi_i$ to be estimated (where $i$ is an index with $0 < i \leq p$), and
white noise $\epsilon_t$ like so:
$
AR(p): \ \
y_t = c + \phi_1 y_{t-1} + \phi_2 y_{t-2} + \dots + \phi_p y_{t-p}
      + \epsilon_t
$.
The moving average part models observations as regressing towards a
linear combination of past forecasting errors.
Formally, an $MA(q)$ model is defined with a drift term $c$, coefficients
$\theta_j$ to be estimated, and white noise terms $\epsilon_t$ (where $j$
is an index with $0 < j \leq q$) as follows:
$
MA(q): \ \
y_t = c + \epsilon_t + \theta_1 \epsilon_{t-1} + \theta_2 \epsilon_{t-2}
      + \dots + \theta_q \epsilon_{t-q}
$.
Finally, an $ARIMA(p,d,q)$ model unifies both parts and adds differencing,
where $d$ is the degree of differencing and the $'$ indicates differenced
values:
$
ARIMA(p,d,q): \ \
y'_t = c + \phi_1 y'_{t-1} + \dots + \phi_p y'_{t-p} + \theta_1 \epsilon_{t-1}
       + \dots + \theta_q \epsilon_{t-q} + \epsilon_{t}
$.

$ARIMA(p,d,q)$ models are commonly fitted with maximum likelihood estimation.
To find an optimal combination of the parameters $p$, $d$, and $q$, the
literature suggests calculating an information theoretical criterion
(e.g., Akaike's Information Criterion) that evaluates the fit on
historical data.
\cite{hyndman2008a} provide a step-wise heuristic to choose $p$, $d$, and $q$
that also decides if a Box-Cox transformation is to be applied, and if so,
which one.
To obtain a one-step-ahead forecast, the above equation is reordered such
that $t$ is substituted with $T+1$.
For forecasts further into the future, the actual observations are
subsequently replaced by their forecasts.
Seasonal ARIMA variants exist; however, the high frequency $k$ in the kind of
demand a UDP faces typically renders them impractical, as too many
coefficients would have to be estimated.
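The step-wise heuristic of \cite{hyndman2008a} is available in several
open-source packages. As a hedged sketch, the pmdarima Python port of R's
auto.arima can be used along these lines; the settings shown are illustrative
defaults, not our study's configuration:

```python
import numpy as np
import pmdarima as pm

y = np.loadtxt("demand.csv")  # hypothetical demand history

# Step-wise search over p, d, and q: d is chosen via unit-root tests,
# while p and q are chosen by minimizing the AIC on the training data.
model = pm.auto_arima(
    y, seasonal=False, stepwise=True, information_criterion="aic",
)
y_hat = model.predict(n_periods=1)  # one-step-ahead forecast
```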
tex/2_lit/2_class/4_stl.tex, 62 lines, new file
@@ -0,0 +1,62 @@
\subsubsection{Seasonal and Trend Decomposition using Loess}
\label{stl}

A time series $y_t$ may exhibit different types of patterns; to fully capture
each of them, the series must be decomposed.
Then, each component is forecast with a distinct model.
Most commonly, the components are the trend $t_t$, the seasonality $s_t$, and
the remainder $r_t$.
They are themselves time series, where only $s_t$ exhibits a periodicity $k$.
A decomposition may be additive (i.e., $y_t = s_t + t_t + r_t$) or
multiplicative (i.e., $y_t = s_t * t_t * r_t$); the former assumes that
the effect of the seasonal component is independent of the overall level
of $y_t$ and vice versa.
The seasonal component is centered around $0$ in both cases such that its
removal does not affect the level of $y_t$.
Often, it is sufficient to only seasonally adjust the time series and model
the trend and remainder together, for example, as $a_t = y_t - s_t$ in the
additive case.

Early approaches employed moving averages (cf., Sub-section \ref{ets}) to
calculate a trend component, and, after removing it from $y_t$, averaged
all observations of the same seasonal lag to obtain the seasonal
component.
The downsides of this are the subjectivity in choosing the window lengths for
the moving average and the seasonal averaging, the incapability of the
seasonal component to vary its amplitude over time, and the missing
handling of outliers.

The X11 method developed at the U.S. Census Bureau and described in detail by
\cite{dagum2016} overcomes these disadvantages.
However, due to its background in economics, it is designed primarily for
quarterly or monthly data, and the change in amplitude over time cannot be
controlled.
Variants of this method are the SEATS decomposition by the Bank of Spain and
the newer X13-SEATS-ARIMA method by the U.S. Census Bureau.
Their main advantages stem from the fact that the models calibrate themselves
according to statistical criteria, without manual work for a statistician,
and that the fitting process is robust to outliers.

\cite{cleveland1990} introduce a seasonal and trend decomposition using a
repeated locally weighted regression, the so-called Loess procedure, to
smooth the trend and seasonal components; it can be viewed as a
generalization of the methods above and is denoted by the acronym
\gls{stl}.
In contrast to the X11, X13, and SEATS methods, STL supports seasonalities
of any lag $k$, which must, however, be determined with additional
statistical tests or set with out-of-band knowledge by the forecaster
(e.g., hourly demand data implies $k = 24 * 7 = 168$, assuming customer
behavior differs on each day of the week).
Moreover, the seasonal component's rate of change, represented by the $ns$
parameter and explained in detail with Figure \ref{f:stl} in Section
\ref{decomp}, must be set by the forecaster as well, while the trend's
smoothness may be controlled by setting a non-default window size.
Outliers are handled by assignment to the remainder such that they do not
affect the trend and seasonal components.
In particular, the manual input needed to calibrate STL explains why only
the X11, X13, and SEATS methods are widely used by practitioners.
However, the widespread adoption of concepts like cross-validation (cf.,
Sub-section \ref{cv}) in recent years enables the usage of an automated
grid search to optimize the parameters.
STL's usage within a grid search is facilitated even further by the fact
that it is computationally cheaper than the other methods discussed.
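As a minimal sketch, an STL decomposition of an hourly demand series with
weekly periodicity can be obtained, for example, with statsmodels; the
parameter values here are illustrative, not the ones tuned in our study:

```python
import numpy as np
from statsmodels.tsa.seasonal import STL

y = np.loadtxt("demand.csv")  # hypothetical hourly demand history

# period corresponds to k = 168; `seasonal` controls the rate of change
# of the seasonal component (the ns parameter above); robust=True
# assigns outliers to the remainder.
result = STL(y, period=168, seasonal=7, robust=True).fit()

# Seasonal adjustment in the additive case: a_t = y_t - s_t.
seasonally_adjusted = y - result.seasonal
```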
tex/2_lit/3_ml/1_intro.tex, 15 lines, new file
@@ -0,0 +1,15 @@
\subsection{Demand Forecasting with Machine Learning Methods}
\label{ml_methods}

ML methods have been employed in all kinds of prediction tasks in recent
years.
In this sub-section, we restrict ourselves to the models that performed well
in our study: Random Forest (\gls{rf}) and Support Vector Regression
(\gls{svr}).
RFs are in general well-suited for datasets without a priori knowledge about
the patterns, while SVR is known to perform well on time series data, as
shown by \cite{hansen2006} in general and by \cite{bao2004} specifically for
intermittent demand.
Gradient Boosting, another popular ML method, was consistently outperformed by
RFs in our study, and artificial neural networks require an amount of data
that far exceeds what our industry partner has.
tex/2_lit/3_ml/2_learning.tex, 53 lines, new file
@@ -0,0 +1,53 @@
\subsubsection{Supervised Learning}
\label{learning}

A conceptual difference between classical and ML methods is the format
of the model inputs.
In ML models, a time series $Y$ is interpreted as labeled data.
Labels are collected into a vector $\vec{y}$, while the corresponding
predictors are aligned in a $(T - n) \times n$ matrix $\mat{X}$:
$$
\vec{y}
=
\begin{pmatrix}
y_T \\
y_{T-1} \\
\dots \\
y_{n+1}
\end{pmatrix}
~~~~~~~~~~
\mat{X}
=
\begin{bmatrix}
y_{T-1} & y_{T-2} & \dots & y_{T-n}     \\
y_{T-2} & y_{T-3} & \dots & y_{T-(n+1)} \\
\dots   & \dots   & \dots & \dots       \\
y_n     & y_{n-1} & \dots & y_1
\end{bmatrix}
$$
The $m = T - n$ rows are referred to as samples and the $n$ columns as
features.
Each row in $\mat{X}$ is "labeled" by the corresponding entry in $\vec{y}$,
and ML models are trained to fit the rows to their labels.
Conceptually, we model a functional relationship $f$ between $\mat{X}$ and
$\vec{y}$ such that the differences between the predictions
$\vec{\hat{y}} = f(\mat{X})$ and the true $\vec{y}$ are minimized
according to some error measure $L(\vec{\hat{y}}, \vec{y})$, where $L$
summarizes the goodness of the fit into a scalar value (e.g., the
well-known mean squared error [MSE]; cf., Section \ref{mase}).
$\mat{X}$ and $\vec{y}$ show the ordinal character of time series data:
Not only do the entries of $\mat{X}$ and $\vec{y}$ overlap, but the rows of
$\mat{X}$ are also shifted versions of each other.
That does not hold for ML applications in general (e.g., the classical
example of predicting spam vs. no spam emails, where the features model
properties of individual emails), and most of the common error measures
presented in introductory texts on ML are only applicable in cases
without such a structure in $\mat{X}$ and $\vec{y}$.
$n$, the number of past time steps required to predict a $y_t$, is an
exogenous model parameter.
For prediction, the forecaster supplies the trained ML model with an input
vector in the same format as a row $\vec{x}_i$ in $\mat{X}$.
For example, to predict $y_{T+1}$, the model takes the vector
$(y_T, y_{T-1}, ..., y_{T-n+1})$ as input.
That is in contrast to the classical methods, where we only supply the number
of time steps to be predicted as a scalar integer.
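Constructing $\mat{X}$ and $\vec{y}$ from a series is a simple sliding-window
transformation. The sketch below mirrors the display above, with the most
recent sample in the first row; the function name is ours:

```python
import numpy as np

def make_labeled_data(y: np.ndarray, n: int):
    """Interpret a time series as m = T - n labeled samples with n features."""
    T = len(y)
    rows, labels = [], []
    # 0-indexed array: y[i] holds observation y_{i+1}, so label y_t is y[t-1].
    for t in range(T, n, -1):                   # t = T, T-1, ..., n+1
        labels.append(y[t - 1])                 # label y_t
        rows.append(y[t - 1 - n:t - 1][::-1])   # features (y_{t-1}, ..., y_{t-n})
    return np.array(rows), np.array(labels)
```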
tex/2_lit/3_ml/3_cv.tex, 38 lines, new file
@@ -0,0 +1,38 @@
\subsubsection{Cross-Validation}
\label{cv}

Because ML models are trained by minimizing a loss function $L$, the
resulting value of $L$ by design underestimates the true error we see when
predicting into the actual future.
To counter that, one popular and model-agnostic approach is cross-validation
(\gls{cv}), as summarized, for example, by \cite{hastie2013}.
CV is a resampling technique that randomly splits the samples into a
training and a test set.
Trained on the former, an ML model makes forecasts on the latter.
Then, the value of $L$ calculated only on the test set gives a realistic and
unbiased estimate of the true forecasting error and may be used for one
of two distinct purposes:
First, it assesses the quality of a fit and provides an idea as to how the
model would perform in production when predicting into the actual future.
Second, the errors of models of either different methods or the same method
with different parameters may be compared with each other to select the
best model.
In order to first select the best model and then assess its quality, one must
apply two chained CVs:
The samples are divided into training, validation, and test sets, and all
models are trained on the training set and compared on the validation set.
Then, the winner is retrained on the union of the training and validation
sets and assessed on the test set.

Regarding the splitting, there are various approaches, and we choose the
so-called $k$-fold CV, where the samples are randomly divided into $k$
folds of the same size.
Each fold is used as a test set once, and the remaining $k-1$ folds become
the corresponding training set.
The resulting $k$ error measures are averaged.
A $k$-fold CV with $k=5$ or $k=10$ is a compromise between the two extreme
cases of having only one split and the so-called leave-one-out CV,
where $k = m$: Computation is still relatively fast, and each sample is
part of several training sets, which maximizes the learning from the data.
We adapt the $k$-fold CV to the ordinal structure in $\mat{X}$ and $\vec{y}$
in Sub-section \ref{unified_cv}.
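For illustration, a plain $k$-fold CV over the samples could look as follows
with scikit-learn. Note that this standard variant shuffles the samples and
thus ignores their ordinal structure, which is exactly what the adaptation in
Sub-section \ref{unified_cv} addresses:

```python
import numpy as np
from sklearn.model_selection import KFold

def kfold_error(model, X, y, k=5):
    """Average a loss L over k random train/test splits."""
    errors = []
    for train_idx, test_idx in KFold(n_splits=k, shuffle=True).split(X):
        model.fit(X[train_idx], y[train_idx])
        y_hat = model.predict(X[test_idx])
        errors.append(np.mean((y[test_idx] - y_hat) ** 2))  # MSE as L
    return np.mean(errors)
```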
tex/2_lit/3_ml/4_rf.tex, 66 lines, new file
@@ -0,0 +1,66 @@
\subsubsection{Random Forest Regression}
\label{rf}

\cite{breiman1984} introduce the classification and regression tree
(\gls{cart}) model that is built around the idea that a single binary
decision tree maps learned combinations of intervals of the feature
columns to a label.
Thus, each sample in the training set is associated with one leaf node,
which is reached by following the tree from its root and branching at
each intermediate node according to a learned splitting rule that
compares the sample's realization of the feature specified by the rule
against a learned threshold.
While such models are computationally fast and offer a high degree of
interpretability, they tend to overfit strongly to the training set, as
the splitting rules are not limited to any functional form (e.g., linear)
in the relationship between the features and the labels.
In the regression case, it is common to maximize the variance reduction $I_V$
from a parent node $N$ to its two children, $C1$ and $C2$, as the
splitting rule.
\cite{breiman1984} formulate this as follows:
$$
I_V(N)
=
\frac{1}{|S_N|^2} \sum_{i \in S_N} \sum_{j \in S_N}
    \frac{1}{2} (y_i - y_j)^2
- \left(
    \frac{1}{|S_{C1}|^2} \sum_{i \in S_{C1}} \sum_{j \in S_{C1}}
        \frac{1}{2} (y_i - y_j)^2
    +
    \frac{1}{|S_{C2}|^2} \sum_{i \in S_{C2}} \sum_{j \in S_{C2}}
        \frac{1}{2} (y_i - y_j)^2
\right)
$$
$S_N$, $S_{C1}$, and $S_{C2}$ are the index sets of the samples in $N$, $C1$,
and $C2$.

\cite{ho1998} and then \cite{breiman2001} generalize this method by combining
many CART models into one forest of trees where every single tree is
a randomized variant of the others.
Randomization is achieved at two steps in the training process:
First, each tree receives a distinct training set resampled with replacement
from the original training set, an idea also called bootstrap
aggregation.
Second, at each node, a random subset of the features is used to grow the
tree.
Trees can be fitted in parallel, which speeds up the training significantly.
For prediction at the tree level, the average of all the samples at a
particular leaf node is used.
Then, the individual values are combined into one value by averaging again
across the trees.
Due to the randomization, the trees are decorrelated, which offsets the
overfitting.
Another measure to counter overfitting is pruning the trees, either by
specifying the maximum depth of a tree or the minimum number of samples
at leaf nodes.

The forecaster must tune the structure of the forest.
Parameters include the number of trees in the forest, the size of the random
subset of features, and the pruning criteria.
The parameters are optimized via grid search: We train many models with
parameters chosen from a pre-defined list of values and select the best
one by CV.
RFs are a convenient ML method for any dataset, as decision trees do not
make any assumptions about the relationship between features and labels.
\cite{herrera2010} use RFs to predict the hourly demand for water in an urban
context, an application similar to the one in this paper, and find that RFs
work well with time series data.
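Such a grid search over an RF's structural parameters can be sketched with
scikit-learn as follows; the grids and the toy data are illustrative, not the
values tuned in our study:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X = rng.poisson(3.0, size=(200, 24)).astype(float)  # toy lag features
y = X[:, 0] + rng.normal(size=200)                  # toy labels

param_grid = {
    "n_estimators": [100, 500, 1000],    # number of trees in the forest
    "max_features": ["sqrt", 0.5, 1.0],  # size of the random feature subset
    "min_samples_leaf": [1, 5, 10],      # pruning criterion
}
# 5-fold CV per parameter combination; scikit-learn's default scoring
# for regressors is the R^2 on the held-out fold.
search = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid, cv=5)
search.fit(X, y)
best_rf = search.best_estimator_
```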
tex/2_lit/3_ml/5_svm.tex, 60 lines, new file
@@ -0,0 +1,60 @@
\subsubsection{Support Vector Regression}
\label{svm}

\cite{vapnik1963} and \cite{vapnik1964} introduce the so-called support vector
machine (\gls{svm}) model, and \cite{vapnik2013} summarizes the research
conducted since then.
In its basic version, SVMs are linear classifiers, modeling a binary
decision, that fit a hyperplane into the feature space of $\mat{X}$ so as
to maximize the margin around the hyperplane separating the two groups of
labels.
SVMs were popularized in the 1990s in the context of optical character
recognition, as shown in \cite{scholkopf1998}.

\cite{drucker1997} and \cite{stitson1999} adapt SVMs to the regression case
(SVR), and \cite{smola2004} provide a comprehensive introduction.
\cite{mueller1997} and \cite{mueller1999} focus on SVRs in the context of time
series data and find that they tend to outperform classical methods.
\cite{chen2006a} and \cite{chen2006b} apply SVRs to predict the hourly demand
for water in cities, an application similar to the UDP case.

In the SVR case, a linear function
$\hat{y}_i = f(\vec{x}_i) = \langle\vec{w},\vec{x}_i\rangle + b$
is fitted so that the actual labels $y_i$ have a deviation of at most
$\epsilon$ from their predictions $\hat{y}_i$ (cf., the constraints
below).
SVRs are commonly formulated as quadratic optimization problems as follows:
$$
\text{minimize }
\frac{1}{2} \norm{\vec{w}}^2 + C \sum_{i=1}^m (\xi_i + \xi_i^*)
\quad \text{subject to }
\begin{cases}
y_i - \langle \vec{w}, \vec{x}_i \rangle - b \leq \epsilon + \xi_i
    \text{,} \\
\langle \vec{w}, \vec{x}_i \rangle + b - y_i \leq \epsilon + \xi_i^*
\end{cases}
$$
$\vec{w}$ are the fitted weights in the row space of $\mat{X}$, $b$ is a bias
term in the column space of $\mat{X}$, and $\langle\cdot,\cdot\rangle$
denotes the dot product.
Minimizing the norm of $\vec{w}$ keeps the fitted function flat and thus
not prone to strong overfitting.
To allow individual samples outside the otherwise hard $\epsilon$ bounds,
non-negative slack variables $\xi_i$ and $\xi_i^*$ are included.
A non-negative parameter $C$ regulates how many samples may violate the
$\epsilon$ bounds and by how much.
To model non-linear relationships, one could use a mapping $\Phi(\cdot)$ for
the $\vec{x}_i$ from the row space of $\mat{X}$ to some higher
dimensional space; however, as the optimization problem only depends on
the dot product $\langle\cdot,\cdot\rangle$ and not the actual entries of
$\vec{x}_i$, it suffices to use a kernel function $k$ such that
$k(\vec{x}_i,\vec{x}_j) = \langle\Phi(\vec{x}_i),\Phi(\vec{x}_j)\rangle$.
Such kernels must fulfill certain mathematical properties, and, besides
polynomial kernels, radial basis functions with
$k(\vec{x}_i,\vec{x}_j) = \exp(-\gamma \norm{\vec{x}_i - \vec{x}_j}^2)$ are
a popular candidate, where $\gamma$ is a parameter controlling how the
distances between any two samples influence the final model.
SVRs work well with sparse data in high dimensional spaces, such as
intermittent demand data, as they minimize the risk of misclassification
or of predicting a value that is far off by maximizing the error
margin, as also noted by \cite{bao2004}.
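A hedged scikit-learn sketch of an $\epsilon$-insensitive SVR with an RBF
kernel follows; the parameter values and toy data are placeholders for the
grid search described in the previous sub-section:

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

rng = np.random.default_rng(0)
X = rng.poisson(3.0, size=(200, 24)).astype(float)  # toy lag features
y = X[:, 0] + rng.normal(size=200)                  # toy labels

# Feature scaling matters for SVRs since the RBF kernel is distance-based.
model = make_pipeline(
    StandardScaler(),
    SVR(kernel="rbf", C=1.0, epsilon=0.1, gamma="scale"),
)
model.fit(X, y)
y_hat = model.predict(X[:1])  # predict the label of one sample
```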
tex/3_mod/1_intro.tex, 6 lines, new file
@@ -0,0 +1,6 @@
\section{Model Formulation}
\label{mod}

In this section, we describe how the platform's raw data are pre-processed
into model inputs and how the forecasting models are built and benchmarked
against each other.
tex/3_mod/2_overall.tex, 28 lines, new file
@@ -0,0 +1,28 @@
\subsection{Overall Approach}
\label{approach_approach}

On a conceptual level, there are three distinct aspects of the model
development process.
First, a pre-processing step transforms the platform's tabular order data into
either time series in Sub-section \ref{grid} or feature matrices in
Sub-section \ref{ml_models}.
Second, a benchmark methodology is developed in Sub-section \ref{unified_cv}
that compares all models on the same scale, in particular, classical
models with ML ones.
Concretely, the CV approach is adapted to the peculiar requirements of
sub-daily and ordinal time series data so as to maximize the predictive
power of all models into the future while keeping them comparable.
Third, the forecasting models are described with respect to their assumptions
and training requirements.
Four classification dimensions are introduced:
\begin{enumerate}
\item \textbf{Timeliness of the Information}:
    whole-day-ahead vs. real-time forecasts
\item \textbf{Time Series Decomposition}: raw vs. decomposed
\item \textbf{Algorithm Type}: "classical" statistics vs. ML
\item \textbf{Data Sources}: pure vs. enhanced (i.e., with external data)
\end{enumerate}
Not all of the eight possible combinations are implemented; instead, the
models are varied along these dimensions to show different effects and
answer the research questions.
tex/3_mod/3_grid.tex, 97 lines, new file
@@ -0,0 +1,97 @@
\subsection{Gridification, Time Tables, and Time Series Generation}
\label{grid}

The platform's tabular order data are sliced with respect to both location and
time and then aggregated into time series in which an observation gives
the number of orders in an area for a time step/interval.
Figure \ref{f:grid} shows how the orders' delivery locations are each
matched to a square-shaped cell, referred to as a pixel, on a grid
covering the entire service area within a city.
This gridification step is also applied to the pickup locations separately.
The grid's lower-left corner is chosen at random.
Applications of this gridification idea to model location-routing problems
can be found, for example, in \cite{winkenbach2015}, \cite{bergmann2020},
\cite{janjevic2019}, \cite{snoeck2020}, and \cite{janjevic2020},
while \cite{singleton2017} portray it as a standard method in the field of
urban analytics.
With increasing pixel sizes, the time series exhibit more order aggregation
and thus possibly a stronger demand pattern.
On the other hand, the larger the pixels, the less valuable the generated
forecasts become as, for example, a courier sent to a pixel
preemptively then faces a longer average distance to a restaurant in the
pixel.

\begin{center}
\captionof{figure}{Gridification for delivery locations in Paris with a pixel
                   size of $1~\text{km}^2$}
\label{f:grid}
\includegraphics[width=.8\linewidth]{static/gridification_for_paris_gray.png}
\end{center}

After gridification, the ad-hoc orders within a pixel are aggregated by their
placement timestamps into sub-daily time steps of pre-defined lengths
to obtain a time table, as exemplified in Figure \ref{f:timetable} with
one-hour intervals.

\begin{center}
\captionof{figure}{Aggregation into a time table with hourly time steps}
\label{f:timetable}
\begin{tabular}{|c||*{9}{c|}}
\hline
\backslashbox{Time}{Day} & \makebox[2em]{\ldots}
    & \makebox[3em]{Mon} & \makebox[3em]{Tue}
    & \makebox[3em]{Wed} & \makebox[3em]{Thu}
    & \makebox[3em]{Fri} & \makebox[3em]{Sat}
    & \makebox[3em]{Sun} & \makebox[2em]{\ldots} \\
\hline
\hline
11:00 & \ldots & $y_{11,Mon}$ & $y_{11,Tue}$ & $y_{11,Wed}$ & $y_{11,Thu}$
      & $y_{11,Fri}$ & $y_{11,Sat}$ & $y_{11,Sun}$ & \ldots \\
\hline
12:00 & \ldots & $y_{12,Mon}$ & $y_{12,Tue}$ & $y_{12,Wed}$ & $y_{12,Thu}$
      & $y_{12,Fri}$ & $y_{12,Sat}$ & $y_{12,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
       & \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
20:00 & \ldots & $y_{20,Mon}$ & $y_{20,Tue}$ & $y_{20,Wed}$ & $y_{20,Thu}$
      & $y_{20,Fri}$ & $y_{20,Sat}$ & $y_{20,Sun}$ & \ldots \\
\hline
21:00 & \ldots & $y_{21,Mon}$ & $y_{21,Tue}$ & $y_{21,Wed}$ & $y_{21,Thu}$
      & $y_{21,Fri}$ & $y_{21,Sat}$ & $y_{21,Sun}$ & \ldots \\
\hline
\ldots & \ldots & \ldots & \ldots & \ldots
       & \ldots & \ldots & \ldots & \ldots & \ldots \\
\hline
\end{tabular}
\end{center}
\

Consequently, each $y_{t,d}$ in Figure \ref{f:timetable} is the number of
all orders within the pixel for the time of day $t$ and day of week
$d$ ($y_t$ and $y_{t,d}$ refer to the same observations; the latter merely
acknowledges the two-dimensional view).
The same trade-off as with gridification applies:
The shorter the interval, the weaker the demand pattern to be expected in
the time series due to less aggregation, while longer intervals lead to
less usable forecasts.
We refer to time steps by their start time, and their number per day, $H$,
is constant.
Given a time table as in Figure \ref{f:timetable}, there are two ways to
generate a time series by slicing:
\begin{enumerate}
\item \textbf{Horizontal View}:
    Take only the order counts for a given time of the day
\item \textbf{Vertical View}:
    Take all order counts and remove the double-seasonal pattern induced
    by the weekday and time of the day with decomposition
\end{enumerate}
Distinct time series are retrieved by iterating through the time tables either
horizontally or vertically in increments of a single time step.
Another property of a generated time series is its length, which, following
the next sub-section, can be interpreted as the sum of the production
training set and the test day.
In summary, a distinct time series is generated from the tabular order data
based on a configuration of parameters for the dimensions pixel size,
number of daily time steps $H$, shape (horizontal vs. vertical), length,
and the time step to be predicted.
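Gridification and the subsequent aggregation boil down to integer division of
the coordinates and a grouped count. A minimal pandas sketch, assuming a
hypothetical orders table with projected delivery coordinates in meters
(columns x and y) and an order placement timestamp:

```python
import pandas as pd

# Hypothetical input: one row per ad-hoc order.
orders = pd.read_csv("orders.csv", parse_dates=["placed_at"])

side = 1_000       # pixel side length in meters, i.e., 1 km^2 pixels
x0, y0 = 0.0, 0.0  # grid origin; chosen at random in the paper

# Gridification: map each delivery location to a pixel on the grid.
orders["pixel_x"] = ((orders["x"] - x0) // side).astype(int)
orders["pixel_y"] = ((orders["y"] - y0) // side).astype(int)

# Aggregation: order counts per pixel and hourly time step,
# i.e., one time table per pixel.
counts = (
    orders.set_index("placed_at")
          .groupby(["pixel_x", "pixel_y"])
          .resample("1h")
          .size()
)
```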
86
tex/3_mod/4_cv.tex
Normal file
86
tex/3_mod/4_cv.tex
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
\subsection{Unified Cross-Validation and Training, Validation, and Test Sets}
|
||||||
|
\label{unified_cv}
|
||||||
|
|
||||||
|
The standard $k$-fold CV, which assumes no structure in the individual
|
||||||
|
features of the samples, as shown in $\mat{X}$ above, is adapted to the
|
||||||
|
ordinal character of time series data:
|
||||||
|
A model must be evaluated on observations that occurred strictly after the
|
||||||
|
ones used for training as, otherwise, the model knows about the future.
|
||||||
|
Furthermore, some models predict only a single to a few time steps before
|
||||||
|
being retrained, while others predict an entire day without retraining
|
||||||
|
(cf., Sub-section \ref{ml_models}).
|
||||||
|
Consequently, we must use a unified time interval wherein all forecasts are
|
||||||
|
made first before the entire interval is evaluated.
|
||||||
|
As whole days are the longest prediction interval for models without
|
||||||
|
retraining, we choose that as the unified time interval.
|
||||||
|
In summary, our CV methodology yields a distinct best model per pixel and day
|
||||||
|
to be forecast.
|
||||||
|
Whole days are also practical for managers who commonly monitor, for example,
|
||||||
|
the routing and thus the forecasting performance on a day-to-day basis.
|
||||||
|
Our methodology assumes that the models are trained at least once per day.
|
||||||
|
As we create operational forecasts into the near future in this paper,
|
||||||
|
retraining all models with the latest available data is a logical step.
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{figure}{Training, validation, and test sets
|
||||||
|
during cross validation}
|
||||||
|
\label{f:cv}
|
||||||
|
\includegraphics[width=.8\linewidth]{static/cross_validation_gray.png}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
The training, validation, and test sets are defined as follows.
|
||||||
|
To exemplify the logic, we refer to Figure \ref{f:cv} that shows the calendar
|
||||||
|
setup (i.e., weekdays on the x-axis) for three days $T_1$, $T_2$, and
|
||||||
|
$T_3$ (shown in dark gray) for which we generate forecasts.
|
||||||
|
Each of these days is, by definition, a test day, and the test set comprises
|
||||||
|
all time series, horizontal or vertical, whose last observation lies on
|
||||||
|
that day.
|
||||||
|
With an assumed training horizon of three weeks, the 21 days before each of
|
||||||
|
the test days constitute the corresponding training sets (shown in lighter
|
||||||
|
gray on the same rows as $T_1$, $T_2$, and $T_3$).
|
||||||
|
There are two kinds of validation sets, depending on the decision to be made.
|
||||||
|
First, if a forecasting method needs parameter tuning, the original training
|
||||||
|
set is divided into as many equally long series as validation days are
|
||||||
|
needed to find stable parameters.
|
||||||
|
The example shows three validation days per test day named $V_n$ (shown
|
||||||
|
in darker gray below each test day).
|
||||||
|
The $21 - 3 = 18$ preceding days constitute the training set corresponding to
|
||||||
|
a validation day.
|
||||||
|
To obtain the overall validation error, the three errors are averaged.
|
||||||
|
We call these \textit{inner} validation sets because they must be repeated
|
||||||
|
each day to re-tune the parameters and because the involved time series
|
||||||
|
are true subsets of the original series.
|
||||||
|
Second, to find the best method per day and pixel, the same averaging logic
|
||||||
|
is applied on the outer level.
|
||||||
|
For example, if we used two validation days to find the best method for $T_3$,
|
||||||
|
we would average the errors of $T_1$ and $T_2$ for each method and select
|
||||||
|
the winner; then, $T_1$ and $T_2$ constitute an \textit{outer} validation
|
||||||
|
set.
|
||||||
|
Whereas the number of inner validation days is method-specific and must be
|
||||||
|
chosen before generating any test day forecasts in the first place, the
|
||||||
|
number of outer validation days may be varied after the fact and is
|
||||||
|
determined empirically as we show in Section \ref{stu}.

Our unified CV approach is also optimized for large-scale production settings,
for example, at companies like Uber.
As \cite{bell2018} note, there is a trade-off as to when each of the
inner time series in the example begins.
While the forecasting accuracy likely increases with more training days,
supporting inner series with increasing lengths, cutting the series
to the same length allows caching the forecasts and errors.
In the example, $V_3$, $V_5$, and $V_7$, as well as $V_6$ and $V_8$, are
identical despite belonging to different inner validation sets.
Caching is also possible on the outer level when searching for an optimal
number of validation days for model selection.
We achieved cache hit ratios of up to 80\% in our implementation in the
empirical study, saving computational resources by the same proportion.
Lastly, we assert that our suggested CV, because it is unified around whole
test days and uses fixed-size time series, is also suitable for creating
consistent learning curves and, thus, for answering \textbf{Q3} on the
relationship between forecast accuracy and the amount of historical data:
We simply increase the length of the outer training set while holding the
test day fixed.
Thus, independent of a method's need for parameter tuning, all methods have
the same demand history available for each test day forecast.
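
To illustrate the caching, consider the following minimal Python sketch;
\texttt{validation\_error}, \texttt{load\_series}, and
\texttt{forecast\_error} are hypothetical names of ours, not part of our
implementation:
\begin{verbatim}
import functools

@functools.lru_cache(maxsize=None)
def validation_error(pixel, start_day, n_days, method):
    """Validation error of `method` trained on the fixed-size series
    of `n_days` days starting at `start_day` in `pixel`.

    Because inner series are cut to the same length, repeated keys
    (e.g., V_3, V_5, and V_7 in the CV example) are computed only
    once and served from the cache otherwise.
    """
    series = load_series(pixel, start_day, n_days)  # hypothetical loader
    return forecast_error(method, series)           # hypothetical scorer
\end{verbatim}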

87
tex/3_mod/5_mase.tex
Normal file
@@ -0,0 +1,87 @@

\subsection{Accuracy Measures}
\label{mase}

Choosing an error measure for both model selection and evaluation is not
straightforward when working with intermittent demand, as shown, for
example, by \cite{syntetos2005}, and one should understand the trade-offs
between measures.
\cite{hyndman2006} provide a study of measures with real-life data taken from
the popular M3-competition and find that most standard measures degenerate
under many scenarios.
They also provide a classification scheme for which we summarize the main
points as they apply to the UDP case:
\begin{enumerate}
\item \textbf{Scale-dependent Errors}:
The error is reported in the same unit as the raw data.
Two popular examples are the root mean square error (RMSE) and the mean
absolute error (MAE).
They may be used for model selection and evaluation within a pixel, and they
are intuitively interpretable; however, they may not be used to compare the
error of, for example, a low-demand pixel (e.g., at the UDP's service
boundary) with that of a high-demand pixel (e.g., downtown).
\item \textbf{Percentage Errors}:
The error is derived from the percentage errors of individual forecasts per
time step and is also intuitively interpretable.
A popular example is the mean absolute percentage error (MAPE), which is the
primary measure in most forecasting competitions.
Whereas such errors could be applied both within and across pixels, they
cannot be calculated reliably for intermittent demand:
If only one time step exhibits no demand, the result is a divide-by-zero
error.
This occurs often even in high-demand pixels due to the slicing.
\item \textbf{Relative Errors}:
A workaround is to calculate a scale-dependent error for the test day and
divide it by the same measure calculated with forecasts of a simple
benchmark method (e.g., the na\"{i}ve method).
An example could be
$\text{RelMAE} = \text{MAE} / \text{MAE}_\text{bm}$.
Nevertheless, even simple benchmark methods sometimes create (near-)perfect
forecasts, and then $\text{MAE}_\text{bm}$ becomes (close to) $0$.
These numerical instabilities occurred so often in our studies that we argue
against using such measures.
\item \textbf{Scaled Errors}:
\cite{hyndman2006} contribute this category and introduce the mean absolute
scaled error (\gls{mase}).
It is defined as the MAE of the actual forecasting method on the test day
(i.e., ``out-of-sample'') divided by the MAE of the (seasonal) na\"{i}ve
method on the entire training set (i.e., ``in-sample'').
A MASE of $1$ indicates that a forecasting method has the same accuracy
on the test day as the (seasonal) na\"{i}ve method applied on the longer
training horizon, and lower values imply higher accuracy.
Within a pixel, its model rankings are identical to the ones obtained with
the MAE.
We acknowledge recent publications, for example, \cite{prestwich2014} and
\cite{kim2016}, showing other ways of tackling the difficulties mentioned.
However, only the MASE provided numerically stable results for all
forecasts in our study.
\end{enumerate}
Consequently, we use the MASE with a seasonal na\"{i}ve benchmark as the
primary measure in this paper.
With the previously introduced notation, it is defined as follows:
$$
\text{MASE}
:=
\frac{\text{MAE}_{\text{out-of-sample}}}{\text{MAE}_{\text{in-sample}}}
=
\frac{\text{MAE}_{\text{forecasts}}}{\text{MAE}_{\text{training}}}
=
\frac{\frac{1}{H} \sum_{h=1}^H |y_{T+h} - \hat{y}_{T+h}|}
     {\frac{1}{T-k} \sum_{t=k+1}^T |y_{t} - y_{t-k}|}
$$
The denominator can only become $0$ if the seasonal na\"{i}ve benchmark makes
a perfect forecast on each day in the training set except the first seven
days, which never happened in our case study involving hundreds of
thousands of individual model trainings.
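For illustration, the computation can be sketched in a few lines of NumPy;
the function name and argument layout are ours, not taken from our
implementation:
\begin{verbatim}
import numpy as np

def mase(y_train, y_test, y_pred, k=7):
    """MASE with a seasonal naive benchmark of periodicity k.

    y_train: the T in-sample observations; y_test and y_pred: the H
    actual and predicted values for the test day.
    """
    y = np.asarray(y_train)
    mae_out = np.mean(np.abs(np.asarray(y_test) - np.asarray(y_pred)))
    mae_in = np.mean(np.abs(y[k:] - y[:-k]))  # in-sample seasonal naive MAE
    return mae_out / mae_in
\end{verbatim}
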
Further, as per the discussion in the subsequent Section \ref{decomp}, we also
calculate peak-MASEs, where we exclude the time steps of non-peak times
from the calculations.
For this analysis, we define all time steps that occur at lunch (i.e., noon to
2 pm) and dinner time (i.e., 6 pm to 8 pm) as peak.
As time steps at non-peak times typically average no or very low order counts,
a UDP may choose not to actively forecast these at all and instead be
interested in the accuracy of forecasting methods during peaks only.

We conjecture that percentage error measures may be usable for UDPs facing a
higher overall demand with no intra-day down-times in between, but we have to
leave that to a future study.
Yet, even with high and steady demand, divide-by-zero errors are likely to
occur.

76
tex/3_mod/6_decomp.tex
Normal file
@@ -0,0 +1,76 @@

\subsection{Time Series Decomposition}
\label{decomp}

Concerning the time table in Figure \ref{f:timetable}, a seasonal demand
pattern is inherent to both horizontal and vertical time series.
First, the weekday influences whether people eat out or order in, with our
partner receiving more orders on Thursday through Saturday than on the other
four days.
This pattern is part of both types of time series.
Second, on any given day, demand peaks occur around lunch and dinner times.
This pattern only concerns vertical series.
Statistical analyses show that horizontally sliced time series indeed exhibit
a periodicity of $k=7$, whereas vertically sliced series only yield a seasonal
component with a regular pattern if the periodicity is set to the product
of the number of weekdays and the daily time steps, indicating a distinct
intra-day pattern per weekday.

Figure \ref{f:stl} shows three exemplary STL decompositions for a
$1~\text{km}^2$ pixel and a vertical time series with 60-minute time steps
(on the x-axis) covering four weeks:
With the noisy raw data $y_t$ on the left, the seasonal and trend components,
$s_t$ and $t_t$, are depicted in light and dark gray for increasing $ns$
parameters.
The plots include (seasonal) na\"{i}ve forecasts for the subsequent test day
as dotted lines.
The remainder components $r_t$ are not shown for conciseness.
The periodicity is set to $k = 7 * 12 = 84$ as our industry partner has $12$
opening hours per day.

\begin{center}
\captionof{figure}{STL decompositions for a medium-demand pixel with hourly
    time steps and periodicity $k=84$}
\label{f:stl}
\includegraphics[width=.95\linewidth]{static/stl_gray.png}
\end{center}

As described in Sub-section \ref{stl}, with $k$ being implied by the
application, at the very least the length of the seasonal smoothing
window, represented by the $ns$ parameter, must be calibrated by the
forecaster:
It controls how many past observations go into each smoothed $s_t$.
Many practitioners, however, skip this step and set $ns$ to a large number,
for example, $999$, then referred to as ``periodic.''
For the other parameters, it is common to use the default values as
specified in \cite{cleveland1990}.
The goal is to find a decomposition with a regular pattern in $s_t$.
In Figure \ref{f:stl}, this is not true for $ns=7$ where, for
example, the four largest bars corresponding to the same time of day a
week apart cannot be connected by an approximately straight line.
By contrast, a regular pattern exists in the most extreme way for
$ns=999$, where the same four largest bars are of the same height.
This observation holds for each time step of the day.
For $ns=11$, $s_t$ exhibits a regular pattern whose bars adapt over time:
The pattern is regular as bars corresponding to the same time of day can be
connected by approximately straight lines, and it is adaptive as these
lines are not horizontal.
The trade-off between small and large values for $ns$ can thus be interpreted
as allowing the average demand during peak times to change over time:
If demand is intermittent at non-peak times, it is reasonable to expect the
bars to change over time, as only the relative differences between peak and
non-peak times impact the bars' heights, with the seasonal component being
centered around $0$.
One way to confirm the goodness of a decomposition statistically is to verify
that $r_t$ can be modeled as a typical error process like white noise
$\epsilon_t$.

However, we suggest an alternative way of calibrating the STL method in an
automated fashion based on our unified CV approach.
As hinted at in Figure \ref{f:stl}, we interpret an STL decomposition as a
forecasting method in its own right by adding the (seasonal) na\"{i}ve
forecasts for $s_t$ and $t_t$ and predicting $0$ for $r_t$.
Then, the $ns$ parameter is tuned just like a parameter of an ML model.
To the best of our knowledge, this has not been proposed before.
Conceptually, forecasting with the STL method can be viewed as a na\"{i}ve
method with built-in smoothing, and it outperformed all other
benchmark methods in all cases.
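
As a minimal sketch of this idea, assuming statsmodels' STL implementation
(which requires the seasonal window, our $ns$, to be an odd integer), the
decomposition-as-forecast can be written as follows; the function name is
ours:
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL

def stl_naive_forecast(y, period=84, ns=11, h=12):
    """Sum of naive forecasts of the STL components.

    Seasonal: repeat the last full period (seasonal naive);
    trend: repeat the last value (naive); remainder: predict 0.
    """
    fit = STL(y, period=period, seasonal=ns).fit()
    s_hat = np.asarray(fit.seasonal)[-period:][:h]
    t_hat = np.full(h, np.asarray(fit.trend)[-1])
    return s_hat + t_hat
\end{verbatim}
The validation MASE of these forecasts then serves as the criterion for
tuning $ns$, for example, over a small grid of odd values.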

20
tex/3_mod/7_models/1_intro.tex
Normal file
@@ -0,0 +1,20 @@

\subsection{Forecasting Models}
\label{models}

This sub-section describes the concrete models in our study.
Figure \ref{f:inputs} shows how we classify them into four families with
regard to the type of the time series, horizontal or vertical, and the
moment at which a model is trained:
Solid lines indicate that the corresponding time steps lie before the
training, and dotted lines show the time horizon predicted by a model.
For conciseness, we only show the forecasts for one test day.
The setup is the same for each inner validation day.

\

\begin{center}
\captionof{figure}{Classification of the models by input type and training
    moment}
\label{f:inputs}
\includegraphics[width=.95\linewidth]{static/model_inputs_gray.png}
\end{center}

42
tex/3_mod/7_models/2_hori.tex
Normal file
@@ -0,0 +1,42 @@

\subsubsection{Horizontal and Whole-day-ahead Forecasts}
\label{hori}

The upper-left part of Figure \ref{f:inputs} illustrates the simplest way
to generate forecasts for a test day before it has started:
For each time of the day, the corresponding horizontal slice becomes the input
for a model.
With whole days being the unified time interval, each model is trained $H$
times, each time providing a one-step-ahead forecast.
While it is possible to select a different model type per time
step, that did not improve the accuracy in the empirical study.
As the models in this family do not include the test day's demand data in
their training sets, we see them as benchmarks to answer \textbf{Q4},
checking whether a UDP can take advantage of real-time information.
The models in this family are as follows; we use prefixes, such as ``h''
here, when methods are applied in other families as well:
\begin{enumerate}
\item \textit{\gls{naive}}:
Observation from the same time step one week prior
\item \textit{\gls{trivial}}:
Predict $0$ for all time steps
\item \textit{\gls{hcroston}}:
Intermittent demand method introduced by \cite{croston1972}
\item \textit{\gls{hholt}},
\textit{\gls{hhwinters}},
\textit{\gls{hses}},
\textit{\gls{hsma}}, and
\textit{\gls{htheta}}:
Exponential smoothing without calibration
\item \textit{\gls{hets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{harima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
\textit{naive} and \textit{trivial} provide an absolute benchmark for the
actual forecasting methods.
\textit{hcroston} is often mentioned in the context of intermittent demand;
however, the method did not perform well at all.
Besides \textit{hhwinters}, which always fits a seasonal component, the
calibration heuristics behind \textit{hets} and \textit{harima} may fit one
as well.
With $k=7$, an STL decomposition is unnecessary here.
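
To make the slicing concrete, here is a minimal pandas sketch; the time
table layout and the moving average stand-in are illustrative assumptions
of ours:
\begin{verbatim}
import pandas as pd

def whole_day_ahead(time_table: pd.DataFrame, window: int = 21) -> pd.Series:
    """One forecast per time of day from horizontal slices.

    time_table: rows = days, columns = intra-day time steps, so each
    column is one horizontal series; as an example method, a simple
    moving average (hsma-style) over the last `window` days.
    """
    return time_table.tail(window).mean(axis=0)
\end{verbatim}
Any of the listed methods can replace the moving average, with one fitted
model per column.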

39
tex/3_mod/7_models/3_vert.tex
Normal file
@@ -0,0 +1,39 @@

\subsubsection{Vertical and Whole-day-ahead Forecasts without Retraining}
\label{vert}

The upper-right part of Figure \ref{f:inputs} shows an alternative way to
generate forecasts for a test day before it has started:
First, a seasonally-adjusted time series $a_t$ is obtained from a vertical
time series by STL decomposition.
Then, the actual forecasting model, trained on $a_t$, makes an $H$-step-ahead
prediction.
Lastly, we add the $H$ seasonal na\"{i}ve forecasts for the seasonal component
$s_t$ to these predictions to obtain the actual forecasts for the test day.
Thus, only one training is required per model type, and no real-time data is
used.
By decomposing the raw time series, all long-term patterns are assumed to be
in the seasonal component $s_t$, and $a_t$ only contains the level with
a potential trend and auto-correlations.
The models in this family are:
\begin{enumerate}
\item \textit{\gls{fnaive}},
\textit{\gls{pnaive}}:
Sum of the na\"{i}ve forecasts of STL's trend and seasonal components
\item \textit{\gls{vholt}},
\textit{\gls{vses}}, and
\textit{\gls{vtheta}}:
Exponential smoothing without calibration and seasonal fit
\item \textit{\gls{vets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{varima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
As mentioned in Sub-section \ref{unified_cv}, we include the sum of the
(seasonal) na\"{i}ve forecasts of the STL's trend and seasonal components
as forecasts in their own right:
For \textit{fnaive}, we tune the ``flexible'' $ns$ parameter, and for
\textit{pnaive}, we set it to a ``periodic'' value.
Thus, we implicitly assume that there is no signal in the remainder $r_t$ and
predict $0$ for it.
\textit{fnaive} and \textit{pnaive} are two more simple benchmarks.
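
A minimal sketch of this pipeline, assuming statsmodels' STL and simple
exponential smoothing as the stand-in model (names are ours):
\begin{verbatim}
import numpy as np
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

def vertical_whole_day_ahead(y, period=84, ns=11, h=12):
    """Decompose, forecast the adjusted series, re-add seasonality."""
    fit = STL(y, period=period, seasonal=ns).fit()
    a = np.asarray(fit.trend) + np.asarray(fit.resid)  # adjusted a_t
    a_hat = SimpleExpSmoothing(a).fit().forecast(h)    # H-step-ahead
    s_hat = np.asarray(fit.seasonal)[-period:][:h]     # seasonal naive
    return a_hat + s_hat
\end{verbatim}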

22
tex/3_mod/7_models/4_rt.tex
Normal file
@@ -0,0 +1,22 @@

\subsubsection{Vertical and Real-time Forecasts with Retraining}
\label{rt}

The lower-left part of Figure \ref{f:inputs} shows how models trained on
vertical time series are extended with real-time order data as it becomes
available during a test day:
Instead of obtaining an $H$-step-ahead forecast, we retrain a model after
every time step and only predict one step.
The remainder is as in the previous sub-section, and the models are:
\begin{enumerate}
\item \textit{\gls{rtholt}},
\textit{\gls{rtses}}, and
\textit{\gls{rttheta}}:
Exponential smoothing without calibration and seasonal fit
\item \textit{\gls{rtets}}:
ETS calibrated as described by \cite{hyndman2008b}
\item \textit{\gls{rtarima}}:
ARIMA calibrated as described by \cite{hyndman2008a}
\end{enumerate}
Retraining \textit{fnaive} and \textit{pnaive} did not increase accuracy, and
thus we left them out.
A downside of this family is the significant increase in computing costs.
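
A sketch of the rolling one-step scheme; \texttt{fit\_and\_predict\_one} is
a placeholder of ours for any of the models above:
\begin{verbatim}
def real_time_forecasts(history, test_day_actuals, fit_and_predict_one):
    """Retrain after every time step and predict only the next step.

    history: past observations of the seasonally-adjusted series;
    test_day_actuals: the H observations arriving during the test day.
    """
    forecasts = []
    for actual in test_day_actuals:
        forecasts.append(fit_and_predict_one(history))  # retrain, 1 step
        history = history + [actual]  # real-time data becomes available
    return forecasts
\end{verbatim}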

54
tex/3_mod/7_models/5_ml.tex
Normal file
@@ -0,0 +1,54 @@

\subsubsection{Vertical and Real-time Forecasts without Retraining}
\label{ml_models}

The lower-right part of Figure \ref{f:inputs} shows how ML models take
real-time order data into account without retraining.
Based on the seasonally-adjusted time series $a_t$, we employ the feature
matrix and label vector representations from Sub-section \ref{learning}
and set $n$ to the number of daily time steps, $H$, to cover all potential
auto-correlations.
The ML models are trained once before a test day starts.
For training, the matrix and vector are populated such that $y_T$ is set to
the last time step of the day before the forecasts, $a_T$.
As the splitting during CV is done with whole days, the \gls{ml} models are
trained on samples from all times of the day in equal measure.
Thus, the ML models learn to predict each time of the day.
For prediction on a test day, the $H$ observations preceding the time
step to be forecast are used as the input vector after seasonal
adjustment.
As a result, real-time data are included.
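
As an illustration, a sketch of the matrix construction and a fit with
scikit-learn's RF regressor; the helper name is ours:
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def lagged_matrix(a, n):
    """Feature matrix of n lags and the label vector for series a."""
    X = np.array([a[i:i + n] for i in range(len(a) - n)])
    return X, np.asarray(a[n:])

# a: seasonally-adjusted series ending at a_T; n = H daily time steps.
# X, y = lagged_matrix(a, n=12)
# model = RandomForestRegressor().fit(X, y)
# On the test day, the (adjusted) H observations preceding the step
# to be forecast form the input vector:
# model.predict(np.asarray(last_h).reshape(1, -1))
\end{verbatim}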
The models in this family are:
\begin{enumerate}
\item \textit{\gls{vrfr}}: RF trained on the matrix as described
\item \textit{\gls{vsvr}}: SVR trained on the matrix as described
\end{enumerate}
We tried other ML models, such as gradient boosting machines, but found
only RFs and SVRs to perform well in our study.
In the case of gradient boosting machines, this is to be expected as they are
known not to perform well in the presence of high noise, which is natural
with low count data, as shown, for example, by \cite{ma2018} and
\cite{mason2000}.
Also, deep learning methods are not applicable as the feature matrices only
consist of several hundred to a few thousand rows (cf., Sub-section
\ref{params}).
In \ref{tabular_ml_models}, we provide an alternative feature matrix
representation that exploits the two-dimensional structure of time tables
without decomposing the time series.
In \ref{enhanced_feats}, we show how feature matrices are extended
to include predictors other than historical order data.
However, to answer \textbf{Q5} already here, none of the external data sources
improves the results in our study.
Due to the high number of time series in our study, investigating why
no external source improves the forecasts requires an automated
approach to analyzing individual time series.
\cite{barbour2014} provide a spectral density estimation approach, based on
the Shannon entropy, that measures the signal-to-noise ratio in a
time series with a number normalized between $0$ and $1$, where lower values
indicate a higher signal-to-noise ratio.
We then looked at averages of the estimates on a daily level per pixel and
found that including any of the external data sources from
\ref{enhanced_feats} always leads to significantly lower signal-to-noise
ratios.
Thus, we conclude that, at least for the demand faced by our industry partner,
the historical data contains all of the signal.
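
The measure can be sketched as the normalized Shannon entropy of the
estimated power spectrum; the following is our simplified illustration with
SciPy's periodogram, not the exact estimator of \cite{barbour2014}:
\begin{verbatim}
import numpy as np
from scipy.signal import periodogram

def spectral_entropy(y):
    """Normalized Shannon entropy of the power spectrum, in [0, 1].

    Values near 1 indicate a flat, noise-like spectrum; lower values
    indicate a concentrated spectrum, i.e., more signal.
    """
    _, psd = periodogram(y)
    p = psd[psd > 0]
    p = p / p.sum()  # treat the spectrum as a probability distribution
    return float(-np.sum(p * np.log(p)) / np.log(len(p)))
\end{verbatim}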

6
tex/4_stu/1_intro.tex
Normal file
@@ -0,0 +1,6 @@

\section{Empirical Study: A Meal Delivery Platform in Europe}
\label{stu}

In the following, we first give a brief overview of the case study dataset
and the parameters we applied to calibrate the time series generation.
Then, we discuss the overall results.

23
tex/4_stu/2_data.tex
Normal file
@@ -0,0 +1,23 @@

\subsection{Case Study Dataset}
\label{data}

The studied dataset consists of a meal delivery platform's entire
transactional data covering the French market from its launch in February
2016 to January 2017.
The platform operated in five cities throughout this period and received a
total of 686,385 orders.
The forecasting models were developed based on the data from Lyon and Paris in
the period from August through December; this ensures comparability across
cities and avoids the irregularities in demand assumed for a new service
within the first operating weeks.
The data exhibit a steady state as the UDP's service area remained
unchanged, and the numbers of orders and couriers grew in lock-step and
organically.
This does not mean that no new restaurants were opened: If that happened, the
new restaurant did not attract new customers, but demand was shifted from
other member restaurants.
Results are similar in both cities, and we only report them for Paris for
greater conciseness.
Lastly, the platform recorded all incoming orders, so lost demand does not
exist in the dataset.
See \ref{dataset} for details on the raw data.

37
tex/4_stu/3_params.tex
Normal file
@@ -0,0 +1,37 @@

\subsection{Calibration of the Time Series Generation Process}
\label{params}

Independent of the concrete forecasting models, the time series generation
must be calibrated.
We concentrate our forecasts on the pickup side for two reasons.
First, the restaurants are significantly fewer in number than the
customers, resulting in more aggregation in the order counts and thus
better pattern recognition.
Second, from an operational point of view, forecasts for the pickups are more
valuable because of the waiting times due to meal preparation.
We choose pixel sizes of $0.5~\text{km}^2$, $1~\text{km}^2$, $2~\text{km}^2$,
and $4~\text{km}^2$, and time steps covering 60-, 90-, and 120-minute
windows, resulting in $H_{60}=12$, $H_{90}=9$, and $H_{120}=6$ time steps per
day with the platform operating between 11 a.m. and 11 p.m., and
corresponding frequencies $k_{60}=7*12=84$, $k_{90}=7*9=63$, and
$k_{120}=7*6=42$ for the vertical time series.
Smaller pixels and shorter time steps yield no recognizable patterns, yet they
would have been more beneficial for tactical routing.
90- and 120-minute time steps are most likely not desirable for routing;
however, we keep them for comparison and note that a UDP may employ such
forecasts to activate more couriers at short notice if (too) high demand is
forecast an hour ahead.
This could, for example, be implemented by paying couriers a premium if they
show up for work at short notice.
Discrete lengths of 3, 4, 5, 6, 7, and 8 weeks are chosen as training
horizons.
We do so as the structure within the pixels (i.e., the number and kind of
restaurants) is not stable for more than two months in a row in the
covered horizon.
That is confirmed by the empirical finding that forecasting accuracy
improves with longer training horizons, but this effect starts to
level off after about six to seven weeks.
So, the demand patterns of more than two months ago do not resemble more
recent ones.

In total, hundreds of thousands of distinct time series are forecast in the
study.

240
tex/4_stu/4_overall.tex
Normal file
@@ -0,0 +1,240 @@

\subsection{Overall Results}
\label{overall_results}

Table \ref{t:results} summarizes the overall best-performing models grouped by
training horizon and a pixel's average daily demand (\gls{add}) for a
pixel size of $1~\text{km}^2$ and 60-minute time steps.
Each combination of pixel and test day counts as one case, and the total
number of cases is denoted as $n$.
Clustering the individual results revealed that a pixel's ADD over the
training horizon is the primary indicator of similarity and that three to
four clusters suffice to obtain cohesive clusters:
We labeled them ``no'', ``low'', ``medium'', and ``high'' demand pixels with
increasing ADD, and present the average MASE per cluster.
The $n$ do not vary significantly across the training horizons, which confirms
that the platform did not grow area-wise and is indeed in a steady state.

\begin{center}
\captionof{table}{Top-3 models by training weeks and average demand
    ($1~\text{km}^2$ pixel size, 60-minute time steps)}
\label{t:results}
\begin{tabular}{|c|c|*{12}{c|}}

\hline
\multirow{3}{*}{\rotatebox{90}{\thead{Training}}}
& \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}}
& \multicolumn{3}{c|}{\thead{No Demand}}
& \multicolumn{3}{c|}{\thead{Low Demand}}
& \multicolumn{3}{c|}{\thead{Medium Demand}}
& \multicolumn{3}{c|}{\thead{High Demand}} \\
~ & ~
& \multicolumn{3}{c|}{(0 - 2.5)}
& \multicolumn{3}{c|}{(2.5 - 10)}
& \multicolumn{3}{c|}{(10 - 25)}
& \multicolumn{3}{c|}{(25 - $\infty$)} \\
\cline{3-14}
~ & ~
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$
& Method & MASE & $n$ \\

\hline \hline
\multirow{3}{*}{3} & 1
& \textbf{\textit{trivial}}
& 0.785 & \multirow{3}{*}{\rotatebox{90}{4586}}
& \textbf{\textit{hsma}}
& 0.819 & \multirow{3}{*}{\rotatebox{90}{2975}}
& \textbf{\textit{hsma}}
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2743}}
& \textbf{\textit{rtarima}}
& 0.872 & \multirow{3}{*}{\rotatebox{90}{2018}} \\
~ & 2
& \textit{hsma} & 0.809 & ~
& \textit{hses} & 0.844 & ~
& \textit{hses} & 0.858 & ~
& \textit{rtses} & 0.873 & ~ \\
~ & 3
& \textit{pnaive} & 0.958 & ~
& \textit{hets} & 0.846 & ~
& \textit{hets} & 0.859 & ~
& \textit{rtets} & 0.877 & ~ \\

\hline
\multirow{3}{*}{4} & 1
& \textbf{\textit{trivial}}
& 0.770 & \multirow{3}{*}{\rotatebox{90}{4532}}
& \textbf{\textit{hsma}}
& 0.825 & \multirow{3}{*}{\rotatebox{90}{3033}}
& \textbf{\textit{hsma}}
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2687}}
& \textbf{\textit{vrfr}}
& 0.855 & \multirow{3}{*}{\rotatebox{90}{2016}} \\
~ & 2
& \textit{hsma} & 0.788 & ~
& \textit{hses} & 0.848 & ~
& \textit{hses} & 0.850 & ~
& \textbf{\textit{rtarima}} & 0.855 & ~ \\
~ & 3
& \textit{pnaive} & 0.917 & ~
& \textit{hets} & 0.851 & ~
& \textit{hets} & 0.854 & ~
& \textit{rtses} & 0.860 & ~ \\

\hline
\multirow{3}{*}{5} & 1
& \textbf{\textit{trivial}}
& 0.780 & \multirow{3}{*}{\rotatebox{90}{4527}}
& \textbf{\textit{hsma}}
& 0.841 & \multirow{3}{*}{\rotatebox{90}{3055}}
& \textbf{\textit{hsma}}
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2662}}
& \textbf{\textit{vrfr}}
& 0.850 & \multirow{3}{*}{\rotatebox{90}{2019}} \\
~ & 2
& \textit{hsma} & 0.803 & ~
& \textit{hses} & 0.859 & ~
& \textit{hets} & 0.845 & ~
& \textbf{\textit{rtarima}} & 0.852 & ~ \\
~ & 3
& \textit{pnaive} & 0.889 & ~
& \textit{hets} & 0.861 & ~
& \textit{hses} & 0.845 & ~
& \textit{vsvr} & 0.854 & ~ \\

\hline
\multirow{3}{*}{6} & 1
& \textbf{\textit{trivial}}
& 0.741 & \multirow{3}{*}{\rotatebox{90}{4470}}
& \textbf{\textit{hsma}}
& 0.847 & \multirow{3}{*}{\rotatebox{90}{3086}}
& \textbf{\textit{hsma}}
& 0.840 & \multirow{3}{*}{\rotatebox{90}{2625}}
& \textbf{\textit{vrfr}}
& 0.842 & \multirow{3}{*}{\rotatebox{90}{2025}} \\
~ & 2
& \textit{hsma} & 0.766 & ~
& \textit{hses} & 0.863 & ~
& \textit{hets} & 0.842 & ~
& \textbf{\textit{hets}} & 0.847 & ~ \\
~ & 3
& \textit{pnaive} & 0.837 & ~
& \textit{hets} & 0.865 & ~
& \textit{hses} & 0.848 & ~
& \textit{vsvr} & 0.848 & ~ \\

\hline
\multirow{3}{*}{7} & 1
& \textbf{\textit{trivial}}
& 0.730 & \multirow{3}{*}{\rotatebox{90}{4454}}
& \textbf{\textit{hsma}}
& 0.858 & \multirow{3}{*}{\rotatebox{90}{3132}}
& \textbf{\textit{hets}}
& 0.845 & \multirow{3}{*}{\rotatebox{90}{2597}}
& \textbf{\textit{hets}}
& 0.840 & \multirow{3}{*}{\rotatebox{90}{2007}} \\
~ & 2
& \textit{hsma} & 0.754 & ~
& \textit{hses} & 0.871 & ~
& \textit{hsma} & 0.847 & ~
& \textbf{\textit{vrfr}} & 0.845 & ~ \\
~ & 3
& \textit{pnaive} & 0.813 & ~
& \textit{hets} & 0.872 & ~
& \textbf{\textit{vsvr}} & 0.850 & ~
& \textit{vsvr} & 0.847 & ~ \\

\hline
\multirow{3}{*}{8} & 1
& \textbf{\textit{trivial}}
& 0.735 & \multirow{3}{*}{\rotatebox{90}{4402}}
& \textbf{\textit{hsma}}
& 0.867 & \multirow{3}{*}{\rotatebox{90}{3159}}
& \textbf{\textit{hets}}
& 0.846 & \multirow{3}{*}{\rotatebox{90}{2575}}
& \textbf{\textit{hets}}
& 0.836 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
~ & 2
& \textit{hsma} & 0.758 & ~
& \textit{hets} & 0.877 & ~
& \textbf{\textit{vsvr}} & 0.850 & ~
& \textbf{\textit{vrfr}} & 0.842 & ~ \\
~ & 3
& \textit{pnaive} & 0.811 & ~
& \textit{hses} & 0.880 & ~
& \textit{hsma} & 0.851 & ~
& \textit{vsvr} & 0.849 & ~ \\

\hline
\end{tabular}
\end{center}
\

We use this table to answer \textbf{Q1} regarding the overall best methods
under different ADDs.
All result tables in the main text report MASEs calculated with all time
steps of a day.
In contrast, \ref{peak_results} shows the same tables with MASEs calculated
with time steps within peak times only (i.e., lunch from 12 pm to 2 pm and
dinner from 6 pm to 8 pm).
The differences lie mainly in the decimals of the individual MASE
averages, while the ranks of the forecasting methods do not change except
in rare cases.
That shows that the presented accuracies are driven by the forecasting
methods' accuracies at peak times.
Intuitively, they all correctly predict zero demand for non-peak times.

Unsurprisingly, the best model for pixels without demand (i.e.,
$0 < \text{ADD} < 2.5$) is \textit{trivial}.
Whereas \textit{hsma} also adapts well, its performance is worse.
None of the more sophisticated models reaches a similar accuracy.
The intuition behind this is that \textit{trivial} is the least distorted by
the relatively large proportion of noise given the low-count nature of the
time series.

For low demand (i.e., $2.5 < \text{ADD} < 10$), there is also a clear
best-performing model, namely \textit{hsma}.
As the non-seasonal \textit{hses} reaches a similar accuracy as its
potentially seasonal generalization, \textit{hets}, we conclude that
the seasonal pattern from the weekdays is not yet strong enough to be
recognized in low demand pixels.
So, in the absence of seasonality, models that only fit a trend part are
the least susceptible to the noise.

For medium demand (i.e., $10 < \text{ADD} < 25$) and training horizons up to
six weeks, the best-performing models are the same as for low demand.
For longer horizons, \textit{hets} provides the highest accuracy.
Thus, to fit a seasonal pattern, longer training horizons are needed.
While \textit{vsvr} enters the top three, \textit{hets} has the edge as it
requires neither parameter tuning nor real-time data.

In summary, except for high demand, simple models trained on horizontal time
series work best.
By contrast, high demand (i.e., $25 < \text{ADD} < \infty$) and less than
six training weeks is the only situation where classical models trained on
vertical time series work well.
Then, \textit{rtarima} outperforms its siblings from Sub-sections
\ref{vert} and \ref{rt}.
We conjecture that intra-day auto-correlations, as caused, for example, by
weather, are the reason for that.
Intuitively, a certain amount of demand (i.e., a high enough signal-to-noise
ratio) is required so that models with auto-correlations can see them
through all the noise.
That idea is supported by \textit{vrfr} reaching a similar accuracy under
high demand, as its tree structure allows it to fit auto-correlations.
As both \textit{rtarima} and \textit{vrfr} incorporate recent demand,
real-time information can indeed improve accuracy.
However, once models are trained on longer horizons, \textit{hets} is more
accurate than \textit{vrfr}.
Thus, to answer \textbf{Q4}, we conclude that real-time information only
improves accuracy if three or four weeks of training material are
available.

In addition to looking at the results in tables covering the entire one-year
horizon, we also created sub-analyses for the distinct seasons spring,
summer (incl. the long holiday season in France), and fall.
Yet, none of the results portrayed in this and the subsequent sections change
in significant ways.
We conjecture that there could be differences if the overall demand of the UDP
increased to a scale beyond the one this case study covers and leave that
to a follow-up study with a bigger UDP.

31
tex/4_stu/5_training.tex
Normal file
@@ -0,0 +1,31 @@

\subsection{Impact of the Training Horizon}
\label{training}

Whereas it is reasonable to assume that forecasts become more accurate as the
training horizon expands, our study reveals some interesting findings.
First, without demand, \textit{trivial} indeed performs better with more
training material, but improved pattern recognition cannot be the cause
here.
Instead, we argue that the reason is that the longer there has been
no steady demand, the higher the chance that this will not change soon.
Further, if we focus on shorter training horizons, the sample necessarily
contains cases where a pixel is initiated after a soon-to-be-popular
restaurant joined the platform:
Demand grows fast, making \textit{trivial} less accurate, and the pixel soon
moves to another cluster.

Second, with low demand, the best-performing \textit{hsma} becomes less
accurate with more training material.
While one could argue that this is due to \textit{hsma} not fitting a trend,
the less accurate \textit{hses} and \textit{hets} do fit a trend.
Instead, we argue that any low-demand time series naturally exhibits a high
noise-to-signal ratio, and \textit{hsma} is the least susceptible to
noise.
Then, to counter the missing trend term, the training horizon must be shorter.

With medium demand, a similar argument can be made; however, the
signal already becomes more apparent, favoring \textit{hets} with more
training data.

Lastly, with high demand, the signal becomes so clear that more sophisticated
models can exploit longer training horizons.

162
tex/4_stu/6_fams.tex
Normal file
@@ -0,0 +1,162 @@

\subsection{Results by Model Families}
\label{fams}

\begin{center}
\captionof{table}{Ranking of benchmark and horizontal models
    ($1~\text{km}^2$ pixel size, 60-minute time steps):
    the table shows the ranks for cases with $2.5 < ADD < 25$
    (and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:hori}
\begin{tabular}{|c|ccc|cccccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{3}{c|}{\thead{Benchmarks}}
& \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
\cline{2-12}
~ & \textit{naive} & \textit{fnaive} & \textit{pnaive}
& \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
& \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
\hline \hline
3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
& 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
& 9 (10) & 10 (9) & 2 (7) & 1 (4) & 6 (8) \\
5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
& 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
& 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
& 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
\hline
\end{tabular}
\end{center}
\

Besides the overall results, we provide an in-depth comparison of the models
within each family.
Instead of reporting the MASE per model, we rank the models holding the
training horizon fixed to make comparison easier.
Table \ref{t:hori} presents the models trained on horizontal time series.
In addition to \textit{naive}, we include \textit{fnaive} and \textit{pnaive}
already here as more competitive benchmarks.
The tables in this section report two rankings simultaneously:
The first number is the rank resulting from lumping the low and medium
clusters together, which yields almost the same rankings when analyzed
individually.
The ranks from only the high demand pixels are in parentheses if they differ.

A first insight is that \textit{fnaive} is the best benchmark in all
scenarios:
Decomposing flexibly by tuning the $ns$ parameter is worth the computational
cost.
Further, if one is limited in the number of non-na\"{i}ve methods,
\textit{hets} is the best compromise and works well across all demand
levels.
It is also the best model independent of the training horizon for high demand.
With low or medium demand, \textit{hsma} is the clear overall winner; yet,
with high demand, models with a seasonal fit (i.e., \textit{harima},
\textit{hets}, and \textit{hhwinters}) are more accurate, in particular
for longer training horizons.
This is due to the demand patterns across the weekdays becoming stronger with
higher overall demand.

\begin{center}
\captionof{table}{Ranking of classical models on vertical time series
    ($1~\text{km}^2$ pixel size, 60-minute time steps):
    the table shows the ranks for cases with $2.5 < ADD < 25$
    (and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:vert}
\begin{tabular}{|c|cc|ccccc|ccccc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{2}{c|}{\thead{Benchmarks}}
& \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
& \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
\cline{2-13}
~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
& \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
& \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
\hline \hline
3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
4 & 2 (8) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
& 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (7) & 4 (2) & 11 (12) \\
5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
& 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
& 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
& 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
8 & 2 (1) & 1 (9) & 8 (5) & 7 (6) & 10 (8)
& 6 & 12 (10) & 5 (2) & 4 & 9 (7) & 3 & 11 \\
\hline
\end{tabular}
\end{center}
\

Table \ref{t:vert} extends the previous analysis to classical models trained
on vertical time series.
Now, the winners from before, \textit{hets} and \textit{hsma}, serve as
benchmarks.
Whereas for low and medium demand, no improvements can be obtained,
\textit{rtarima} and \textit{rtses} are the most accurate with high demand
and short training horizons.
For six or more training weeks, \textit{hets} is still optimal.
Independent of retraining and the demand level, the models' relative
performances are consistent:
The \textit{*arima} and \textit{*ses} models are best, followed by
\textit{*ets}, \textit{*holt}, and \textit{*theta}.
Thus, models that can deal with auto-correlations and short-term forecasting
errors, as expressed by moving averages, and that cannot be distracted by
trend terms are optimal for vertical series.

Finally, Table \ref{t:ml} compares the two ML-based models against the
best-performing classical models and answers \textbf{Q2}:
With low and medium demand, no improvements can be obtained again; however,
with high demand, \textit{vrfr} has the edge over \textit{rtarima} for
training horizons up to six weeks.
We conjecture that \textit{vrfr} fits auto-correlations better than
\textit{varima} and is not distracted by short-term noise as
\textit{rtarima} may be due to the retraining.
With seven or eight training weeks, \textit{hets} remains the overall winner.
Interestingly, \textit{vsvr} is more accurate than \textit{vrfr} for low and
medium demand.
We assume that \textit{vrfr} performs well only with strong auto-correlations,
which are not present with low and medium demand.

\begin{center}
\captionof{table}{Ranking of ML models on vertical time series
    ($1~\text{km}^2$ pixel size, 60-minute time steps):
    the table shows the ranks for cases with $2.5 < ADD < 25$
    (and $25 < ADD < \infty$ in parentheses if they differ)}
\label{t:ml}
\begin{tabular}{|c|cccc|cc|}
\hline
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
& \multicolumn{4}{c|}{\thead{Benchmarks}}
& \multicolumn{2}{c|}{\thead{ML}} \\
\cline{2-7}
~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
& \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
\hline \hline
3 & 6 & 2 (5) & 1 (4) & 3 (1) & 5 (2) & 4 (3) \\
4 & 6 (5) & 2 (4) & 1 (6) & 3 (2) & 5 (1) & 4 (3) \\
5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
\hline
\end{tabular}
\end{center}
\

Analogously, we created tables like Tables \ref{t:hori} to \ref{t:ml} for the
forecasts with time steps of 90 and 120 minutes and find that the relative
rankings do not change significantly.
The same holds true for the rankings with changing pixel sizes.
For conciseness, we do not include these additional tables in this
article.
In summary, the relative performances exhibited by the model families
are rather stable in this case study.

27
tex/4_stu/7_pixels_intervals.tex
Normal file
@@ -0,0 +1,27 @@

\subsection{Effects of the Pixel Size and Time Step Length}
\label{pixels_intervals}

As elaborated in Sub-section \ref{grid}, more order aggregation leads to a
higher overall demand level and improved pattern recognition in the
generated time series.
Consequently, individual cases tend to move to the right in tables equivalent
to Table \ref{t:results}.
With the same $ADD$ clusters, forecasts for pixel sizes of $2~\text{km}^2$ and
$4~\text{km}^2$, or time intervals of 90 and 120 minutes, or combinations
thereof yield results similar to those of the best models as revealed in
Tables \ref{t:results}, \ref{t:hori}, \ref{t:vert}, and \ref{t:ml} for high
demand.
By contrast, forecasts for $0.5~\text{km}^2$ pixels have most of the cases
(i.e., $n$) in the no or low demand clusters.
In that case, the pixels are too small, and pattern recognition becomes
harder.
While it is true that \textit{trivial} exhibits the overall lowest MASE
for no demand cases, these forecasts become effectively worthless for
operations.
In the extreme, with even smaller pixels, we would be forecasting $0$ orders
in all pixels for all time steps.
In summary, the best model and its accuracy are determined primarily by the
$ADD$, and the pixel size and interval length are merely parameters to
control it.
The forecaster's goal is to create a grid with pixels as small as possible
without losing a recognizable pattern.

6
tex/5_con/1_intro.tex
Normal file
@@ -0,0 +1,6 @@

\section{Conclusion}
\label{con}

We conclude this paper by elaborating on how the findings are transferable
to similar settings, providing some implications for a UDP's
managers, and discussing further research opportunities.

23
tex/5_con/2_generalizability.tex
Normal file
@@ -0,0 +1,23 @@

\subsection{Generalizability of the Methodology and Findings}
\label{generalizability}

Whereas forecasting applications are always data-specific, the following
aspects generalize to UDPs with ad-hoc transportation services:
\begin{itemize}
\item \textbf{Double Seasonality}:
The double seasonality causes a periodicity $k$ too large to be modeled by
classical models, and we adapt the STL method in the \textit{fnaive} model
such that it ``flexibly'' fits a seasonal pattern changing in a non-trivial
way over time.
\item \textbf{Order Sparsity}:
The intermittent time series resulting from gridification require simple
methods, like \textit{hsma} or \textit{trivial}, that are not as susceptible
to noise as more sophisticated ones.
\item \textbf{Unified CV}:
A CV unified around whole days allows evaluating classical statistical and ML
methods on the same scale.
It is agnostic of both the type of the time series and retraining.
\item \textbf{Error Measure}:
Analogous to \cite{hyndman2006}, we emphasize the importance of choosing a
consistent error measure and argue for increased use of the MASE.
\end{itemize}

61
tex/5_con/3_implications.tex
Normal file
@@ -0,0 +1,61 @@

\subsection{Managerial Implications}
\label{implications}

Even though the zeitgeist claims that having more data is always better, our
study shows this is not the case here:
First, under certain circumstances, accuracy may go up with shorter training
horizons.
Second, none of the external data sources improves the accuracies.
Somewhat surprisingly, despite ML-based methods' popularity in both business
and academia in recent years, we must conclude that classical forecasting
methods suffice to reach the best accuracy in our study.
There is one case where ML-based methods are competitive in our case study:
In a high demand pixel (defined as more than 25 orders per day on average),
if only about four to six weeks of past data are available,
the \textit{vrfr} model outperformed the classical ones.
So, we recommend trying out ML-based methods in such scenarios.
In addition, with the \textit{hsma} and \textit{hets} models being the overall
winners, incorporating real-time data is not beneficial, in particular
with more than six weeks of training material available.
Lastly, with just \textit{hets}, which exhibits an accuracy comparable to
\textit{hsma} for low and medium demand, our industry partner can likely
schedule its shifts on an hourly basis one week in advance.

This study gives rise to the following managerial implications.
First, UDPs can implement readily available forecasting algorithms with
limited effort.
This, however, requires purposeful data collection and preparation by those
companies, which, according to our study, is at least as important as
the selection of the forecasting algorithm, as becomes clear from
investigating the impact of the length of the training horizon.
Second, the benefits of moving from manual to automated forecasting
include being able to pursue a predictive routing strategy and
demand-adjusted shift scheduling.
At the time the case study data was collected, our industry partner did not
conduct any forecasting; the only forecasting-related activities were the
shift managers scheduling the shifts one week in advance manually in
spreadsheets.
Thus, selecting the right forecasting algorithm according to the framework
proposed in this study becomes a prerequisite for the operational
improvements UDPs need to achieve in their quest for profitability.
In general, many UDPs launched in recent years are venture capital backed
start-up companies that almost by definition do not have a strong
grounding in operational excellence, and publications such as the ones by
Uber are the exception rather than the rule.
Our paper shows that forecasting the next couple of hours can already be
implemented within the first year of a UDP's operations.
Even if such forecasts could not be exploited for predictive routing (e.g.,
due to prolonged waiting times at restaurants), they would help monitor the
operations for exceptional events.
Additionally, the shift planning may be automated, saving as much as one
shift manager per city.
We emphasize that, for the most part, our proposed forecasting system is
calibrated automatically, and no manual work by a data scientist is required.
The only two parameters where assumptions need to be made are the pixel size
and the time step.
The results in our empirical study suggest that a pixel size of
$1~\text{km}^2$ and a time step of one hour are ideal, resulting in the
optimal trade-off between signal strength and spatial-temporal resolution.
Future research may explore adaptive grid-sizing depending on, for instance,
demand density.

45
tex/5_con/4_further_research.tex
Normal file
@@ -0,0 +1,45 @@

\subsection{Further Research}
\label{further_research}

Sub-sections \ref{overall_results} and \ref{fams} present the models' average
performance.
We did not research which model is best in a given pixel on a given day.
To answer this, a study finding an optimal number of outer validation days is
necessary.
With the varying effect of the training horizon, this model selection is a
two-dimensional grid search that is prone to overfitting due to the high
noise in low count data.
Except for heuristics relating the ADD to the training horizon, we cannot say
anything about that based on our study.
\cite{lemke2010} and \cite{wang2009} show how, for example, a time series'
characteristics may be used to select models.
Thus, we suggest conducting more detailed analyses on how to incorporate model
selection into our proposed forecasting system.

Future research should also integrate our forecasting system into a predictive
routing application and evaluate its business impact.
This embeds our research into the vast literature on the VRP.
Initially introduced by \cite{dantzig1959}, \gls{vrp}s are concerned with
finding optimal routes serving customers.
We refer to \cite{toth2014} for a comprehensive overview.
The two variants relevant for the UDP case are the dynamic VRP and
the pickup and delivery problem (\gls{pdp}).
A VRP is dynamic if the data to solve a problem only becomes available
as the operations are underway.
\cite{thomas2010}, \cite{pillac2013}, and \cite{psaraftis2016} describe how
technological advances, in particular mobile technologies, have led to a
renewed interest in research on dynamic VRPs, and
\cite{berbeglia2010} provide a general overview.
\cite{ichoua2006} and \cite{ferrucci2013} provide solution methods for
simulation studies where they assume stochastic customer demand based on
historical distributions.
In both studies, dummy demand nodes are inserted into the VRP instance.
Forecasts by our system extend this idea naturally, as dummy nodes could be
derived from point forecasts as well.
The concrete case of a meal-delivering UDP is covered by a recent
literature stream started by \cite{ulmer2017} and extended by
\cite{reyes2018} and \cite{yildiz2018}: They coin the term meal delivery
routing problem (\gls{mdrp}).
The MDRP is a special case of the dynamic PDP whose defining
characteristic is that, once a vehicle is scheduled, a modification of the
route is inadmissible.
|
54
tex/apx/case_study.tex
Normal file
54
tex/apx/case_study.tex
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
\section{Raw Order Data in the Case Study}
|
||||||
|
\label{dataset}
|
||||||
|
|
||||||
|
The raw data for the empirical study in Section \ref{stu} was provided by a
|
||||||
|
meal delivery platform operating in five cities in France in 2016.
|
||||||
|
The platform received a total of 686,385 orders distributed as follows:
|
||||||
|
|
||||||
|
\
|
||||||
|
\begin{center}
|
||||||
|
\begin{tabular}{llr}
|
||||||
|
\hline
|
||||||
|
\thead{City} & \thead{Launch Day} & \thead{Orders} \\
|
||||||
|
\hline
|
||||||
|
Bordeaux & July 18 & 64,012 \\
|
||||||
|
Lille & October 30 & 14,362 \\
|
||||||
|
Lyon & February 21 & 214,635 \\
|
||||||
|
Nantes & October 31 & 12,900 \\
|
||||||
|
Paris & March 7 & 380,476 \\
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
\
|
||||||
|
|
||||||
|
The part of the database relevant for forecasting can be thought of as one
|
||||||
|
table per city, where each row represents one order and consists of the
|
||||||
|
following groups of columns:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textbf{Restaurant Data}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item unique ID and name
|
||||||
|
\item pickup location as latitude-longitude pair
|
||||||
|
\end{enumerate}
|
||||||
|
\item \textbf{Customer Data}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item unique ID, name, and phone number
|
||||||
|
\item delivery location as latitude-longitude pair (mostly physical
|
||||||
|
addresses but also public spots)
|
||||||
|
\end{enumerate}
|
||||||
|
\item \textbf{Timestamps}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item placement via the smartphone app
|
||||||
|
\item fulfillment workflow (pickup, delivery, cancellation, re-deliveries)
|
||||||
|
\end{enumerate}
|
||||||
|
\item \textbf{Courier Data}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item unique ID, name, and phone number
|
||||||
|
\item shift data (begin, breaks, end)
|
||||||
|
\item average speed
|
||||||
|
\end{enumerate}
|
||||||
|
\item \textbf{Order Details}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item meals and drinks
|
||||||
|
\item prices and discounts granted
|
||||||
|
\end{enumerate}
|
||||||
|
\end{enumerate}
|
121
tex/apx/enhanced_feats.tex
Normal file
121
tex/apx/enhanced_feats.tex
Normal file
|
@ -0,0 +1,121 @@
|
||||||
|
\section{Enhancing Forecasting Models with External Data}
|
||||||
|
\label{enhanced_feats}
|
||||||
|
|
||||||
|
In this appendix, we show how the feature matrix in Sub-section
|
||||||
|
\ref{ml_models} can be extended with features other than historical order
|
||||||
|
data.
|
||||||
|
Then, we provide an overview of what external data we tried out as predictors
|
||||||
|
in our empirical study.
|
||||||
|
|
||||||
|
\subsection{Enhanced Feature Matrices}
|
||||||
|
|
||||||
|
Feature matrices can naturally be extended by appending new feature columns
|
||||||
|
$x_{t,f}$ or $x_f$ on the right where the former represent predictors
|
||||||
|
changing throughout a day and the latter being static either within a
|
||||||
|
pixel or across a city.
|
||||||
|
$f$ refers to an external predictor variable, such as one of the examples
|
||||||
|
listed below.
|
||||||
|
In the SVR case, the columns should be standardized before fitting as external
|
||||||
|
predictors are most likely on a different scale than the historic order
|
||||||
|
data.
|
||||||
|
Thus, for a matrix with seasonally-adjusted order data $a_t$ in it, an
|
||||||
|
enhanced matrix looks as follows:
|
||||||
|
|
||||||
|
$$
|
||||||
|
\vec{y}
|
||||||
|
=
|
||||||
|
\begin{pmatrix}
|
||||||
|
a_T \\
|
||||||
|
a_{T-1} \\
|
||||||
|
\dots \\
|
||||||
|
a_{H+1}
|
||||||
|
\end{pmatrix}
|
||||||
|
~~~~~
|
||||||
|
\mat{X}
|
||||||
|
=
|
||||||
|
\begin{bmatrix}
|
||||||
|
a_{T-1} & a_{T-2} & \dots & a_{T-H} & ~~~
|
||||||
|
& x_{T,A} & \dots & x_{B} & \dots \\
|
||||||
|
a_{T-2} & a_{T-3} & \dots & a_{T-(H+1)} & ~~~
|
||||||
|
& x_{T-1,A} & \dots & x_{B} & \dots \\
|
||||||
|
\dots & \dots & \dots & \dots & ~~~
|
||||||
|
& \dots & \dots & \dots & \dots \\
|
||||||
|
a_H & a_{H-1} & \dots & a_1 & ~~~
|
||||||
|
& x_{H+1,A} & \dots & x_{B} & \dots
|
||||||
|
\end{bmatrix}
|
||||||
|
$$
|
||||||
|
\
|
||||||
|
|
||||||
|
Similarly, we can also enhance the tabular matrices from
|
||||||
|
\ref{tabular_ml_models}.
|
||||||
|
The same comments as for their pure equivalents in Sub-section \ref{ml_models}
|
||||||
|
apply, in particular, that ML models trained with an enhanced matrix can
|
||||||
|
process real-time data without being retrained.
|
||||||
|
|
||||||
|
\subsection{External Data in the Empirical Study}
|
||||||
|
\label{external_data}
|
||||||
|
|
||||||
|
In the empirical study, we tested four groups of external features that we
|
||||||
|
briefly describe here.
|
||||||
|
|
||||||
|
\vskip 0.1in
|
||||||
|
|
||||||
|
\textbf{Calendar Features}:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Time of day (as synthesized integers: e.g., 1,050 for 10:30 am,
|
||||||
|
or 1,600 for 4 pm)
|
||||||
|
\item Day of week (as one-hot encoded booleans)
|
||||||
|
\item Work day or not (as booleans)
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\vskip 0.1in
|
||||||
|
|
||||||
|
\textbf{Features derived from the historical Order Data}:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Number of pre-orders for a time step (as integers)
|
||||||
|
\item 7-day SMA of the percentages of discounted orders (as percentages):
|
||||||
|
The platform is known for running marketing campaigns aimed at
|
||||||
|
first-time customers at irregular intervals. Consequently, the
|
||||||
|
order data show a wave-like pattern of coupons redeemed when looking
|
||||||
|
at the relative share of discounted orders per day.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\vskip 0.1in
|
||||||
|
|
||||||
|
\textbf{Neighborhood Features}:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Ambient population (as integers) as obtained from the ORNL LandScan
|
||||||
|
database
|
||||||
|
\item Number of active platform restaurants (as integers)
|
||||||
|
\item Number of overall restaurants, food outlets, retailers, and other
|
||||||
|
businesses (as integers) as obtained from the Google Maps and Yelp
|
||||||
|
web services
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\vskip 0.1in
|
||||||
|
|
||||||
|
\textbf{Real-time Weather} (raw data obtained from IBM's
|
||||||
|
Wunderground database):
|
||||||
|
\begin{itemize}
|
||||||
|
\item Absolute temperature, wind speed, and humidity
|
||||||
|
(as decimals and percentages)
|
||||||
|
\item Relative temperature with respect to 3-day and 7-day historical
|
||||||
|
means (as decimals)
|
||||||
|
\item Day vs. night defined by sunset (as booleans)
|
||||||
|
\item Summarized description (as indicators $-1$, $0$, and $+1$)
|
||||||
|
\item Lags of the absolute temperature and the summaries covering the
|
||||||
|
previous three hours
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\vskip 0.1in
|
||||||
|
|
||||||
|
Unfortunately, we must report that none of the mentioned external data
|
||||||
|
improved the accuracy of the forecasts.
|
||||||
|
Some led to models overfitting the data, which could not be regulated.
|
||||||
|
Manual tests revealed that real-time weather data are the most promising
|
||||||
|
external source.
|
||||||
|
Nevertheless, the data provided by IBM's Wunderground database originate from
|
||||||
|
weather stations close to airports, which implies that we only have the
|
||||||
|
same aggregate weather data for the entire city.
|
||||||
|
If weather data is available on a more granular basis in the future, we see
|
||||||
|
some potential for exploitation.
|
144
tex/apx/glossary.tex
Normal file
144
tex/apx/glossary.tex
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
\section{Glossary}
|
||||||
|
\label{glossary}
|
||||||
|
|
||||||
|
% Abbreviations for technical terms.
|
||||||
|
\newglossaryentry{add}{
|
||||||
|
name=ADD, description={Average Daily Demand}
|
||||||
|
}
|
||||||
|
\newglossaryentry{cart}{
|
||||||
|
name=CART, description={Classification and Regression Trees}
|
||||||
|
}
|
||||||
|
\newglossaryentry{cv}{
|
||||||
|
name=CV, description={Cross Validation}
|
||||||
|
}
|
||||||
|
\newglossaryentry{mase}{
|
||||||
|
name=MASE, description={Mean Absolute Scaled Error}
|
||||||
|
}
|
||||||
|
\newglossaryentry{mdrp}{
|
||||||
|
name=MDRP, description={Meal Delivery Routing Proplem}
|
||||||
|
}
|
||||||
|
\newglossaryentry{ml}{
|
||||||
|
name=ML, description={Machine Learning}
|
||||||
|
}
|
||||||
|
\newglossaryentry{pdp}{
|
||||||
|
name=PDP, description={Pickup and Delivery Problem}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rf}{
|
||||||
|
name=RF, description={Random Forest}
|
||||||
|
}
|
||||||
|
\newglossaryentry{stl}{
|
||||||
|
name=STL, description={Seasonal and Trend Decomposition using Loess}
|
||||||
|
}
|
||||||
|
\newglossaryentry{svm}{
|
||||||
|
name=SVM, description={Support Vector Machine}
|
||||||
|
}
|
||||||
|
\newglossaryentry{svr}{
|
||||||
|
name=SVR, description={Support Vector Regression}
|
||||||
|
}
|
||||||
|
\newglossaryentry{udp}{
|
||||||
|
name=UDP, description={Urban Delivery Platform}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vrp}{
|
||||||
|
name=VRP, description={Vehicle Routing Problem}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Model names.
|
||||||
|
\newglossaryentry{naive}{
|
||||||
|
name=naive, description={(Seasonal) Na\"{i}ve Method}
|
||||||
|
}
|
||||||
|
\newglossaryentry{fnaive}{
|
||||||
|
name=fnaive, description={"Flexible" STL Decomposition,
|
||||||
|
with tuned ns parameter}
|
||||||
|
}
|
||||||
|
\newglossaryentry{pnaive}{
|
||||||
|
name=pnaive, description={"Periodic" STL Decomposition,
|
||||||
|
with ns parameter set to large number}
|
||||||
|
}
|
||||||
|
\newglossaryentry{trivial}{
|
||||||
|
name=trivial, description={Trivial Method}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hcroston}{
|
||||||
|
name=hcroston, description={Croston's Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hholt}{
|
||||||
|
name=hholt, description={Holt's Linear Trend Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vholt}{
|
||||||
|
name=vholt, description={Holt's Linear Trend Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rtholt}{
|
||||||
|
name=rtholt, description={Holt's Linear Trend Method,
|
||||||
|
(re)trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hhwinters}{
|
||||||
|
name=hhwinters, description={Holt-Winter's Seasonal Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hses}{
|
||||||
|
name=hses, description={Simple Exponential Smoothing Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vses}{
|
||||||
|
name=vses, description={Simple Exponential Smoothing Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rtses}{
|
||||||
|
name=rtses, description={Simple Exponential Smoothing Method,
|
||||||
|
(re)trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hsma}{
|
||||||
|
name=hsma, description={Simple Moving Average Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{htheta}{
|
||||||
|
name=htheta, description={Theta Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vtheta}{
|
||||||
|
name=vtheta, description={Theta Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rttheta}{
|
||||||
|
name=rttheta, description={Theta Method,
|
||||||
|
(re)trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{hets}{
|
||||||
|
name=hets, description={ETS State Space Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vets}{
|
||||||
|
name=vets, description={ETS State Space Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rtets}{
|
||||||
|
name=rtets, description={ETS State Space Method,
|
||||||
|
(re)trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{harima}{
|
||||||
|
name=harima, description={Autoregressive Integrated Moving Average
|
||||||
|
Method,
|
||||||
|
trained on horizontal time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{varima}{
|
||||||
|
name=varima, description={Autoregressive Integrated Moving Average
|
||||||
|
Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{rtarima}{
|
||||||
|
name=rtarima, description={Autoregressive Integrated Moving Average
|
||||||
|
Method,
|
||||||
|
(re)trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vrfr}{
|
||||||
|
name=vrfr, description={Random Forest Regression Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
\newglossaryentry{vsvr}{
|
||||||
|
name=vsvr, description={Support Vector Regression Method,
|
||||||
|
trained on vertical time series}
|
||||||
|
}
|
||||||
|
|
||||||
|
\printglossary[title=]
|
258
tex/apx/peak_results.tex
Normal file
258
tex/apx/peak_results.tex
Normal file
|
@ -0,0 +1,258 @@
|
||||||
|
\section{Forecasting Accuracies during Peak Times}
|
||||||
|
\label{peak_results}
|
||||||
|
|
||||||
|
This appendix shows all tables from the main text
|
||||||
|
with the MASE averages calculated from time steps within peak times
|
||||||
|
that are defined to be from 12 pm to 2 pm (=lunch) or from 6 pm to 8 pm (=dinner).
|
||||||
|
While the exact decimals of the MASEs differ,
|
||||||
|
the relative ranks of the forecasting methods are the same except in rare cases.
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Top-3 models by training weeks and average demand
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps)}
|
||||||
|
\label{t:results:a}
|
||||||
|
\begin{tabular}{|c|c|*{12}{c|}}
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{\rotatebox{90}{\thead{Training}}}
|
||||||
|
& \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{No Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Low Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Medium Demand}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{High Demand}} \\
|
||||||
|
~ & ~
|
||||||
|
& \multicolumn{3}{c|}{(0 - 2.5)}
|
||||||
|
& \multicolumn{3}{c|}{(2.5 - 10)}
|
||||||
|
& \multicolumn{3}{c|}{(10 - 25)}
|
||||||
|
& \multicolumn{3}{c|}{(25 - $\infty$)} \\
|
||||||
|
\cline{3-14}
|
||||||
|
~ & ~
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$
|
||||||
|
& Method & MASE & $n$ \\
|
||||||
|
|
||||||
|
\hline \hline
|
||||||
|
\multirow{3}{*}{3} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.794 & \multirow{3}{*}{\rotatebox{90}{4586}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.817 & \multirow{3}{*}{\rotatebox{90}{2975}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.838 & \multirow{3}{*}{\rotatebox{90}{2743}}
|
||||||
|
& \textbf{\textit{rtarima}}
|
||||||
|
& 0.871 & \multirow{3}{*}{\rotatebox{90}{2018}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.808 & ~
|
||||||
|
& \textit{hses} & 0.847 & ~
|
||||||
|
& \textit{hses} & 0.851 & ~
|
||||||
|
& \textit{rtses} & 0.872 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.938 & ~
|
||||||
|
& \textit{hets} & 0.848 & ~
|
||||||
|
& \textit{hets} & 0.853 & ~
|
||||||
|
& \textit{rtets} & 0.874 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{4} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.791 & \multirow{3}{*}{\rotatebox{90}{4532}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.833 & \multirow{3}{*}{\rotatebox{90}{3033}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2687}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.848 & \multirow{3}{*}{\rotatebox{90}{2016}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.794 & ~
|
||||||
|
& \textit{hses} & 0.838 & ~
|
||||||
|
& \textit{hses} & 0.847 & ~
|
||||||
|
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.907 & ~
|
||||||
|
& \textit{hets} & 0.841 & ~
|
||||||
|
& \textit{hets} & 0.851 & ~
|
||||||
|
& \textit{rtses} & 0.857 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{5} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.782 & \multirow{3}{*}{\rotatebox{90}{4527}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.844 & \multirow{3}{*}{\rotatebox{90}{3055}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2662}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.849 & \multirow{3}{*}{\rotatebox{90}{2019}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.802 & ~
|
||||||
|
& \textit{hses} & 0.851 & ~
|
||||||
|
& \textit{hets} & 0.844 & ~
|
||||||
|
& \textbf{\textit{rtarima}} & 0.851 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.888 & ~
|
||||||
|
& \textit{hets} & 0.863 & ~
|
||||||
|
& \textit{hses} & 0.845 & ~
|
||||||
|
& \textit{vsvr} & 0.853 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{6} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.743 & \multirow{3}{*}{\rotatebox{90}{4470}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{3086}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.841 & \multirow{3}{*}{\rotatebox{90}{2625}}
|
||||||
|
& \textbf{\textit{vrfr}}
|
||||||
|
& 0.844 & \multirow{3}{*}{\rotatebox{90}{2025}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.765 & ~
|
||||||
|
& \textit{hses} & 0.853 & ~
|
||||||
|
& \textit{hses} & 0.844 & ~
|
||||||
|
& \textbf{\textit{hets}} & 0.847 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.836 & ~
|
||||||
|
& \textit{hets} & 0.861 & ~
|
||||||
|
& \textit{hets} & 0.844 & ~
|
||||||
|
& \textit{vsvr} & 0.849 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{7} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.728 & \multirow{3}{*}{\rotatebox{90}{4454}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.855 & \multirow{3}{*}{\rotatebox{90}{3132}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2597}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.839 & \multirow{3}{*}{\rotatebox{90}{2007}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.744 & ~
|
||||||
|
& \textit{hses} & 0.862 & ~
|
||||||
|
& \textit{hsma} & 0.845 & ~
|
||||||
|
& \textbf{\textit{vrfr}} & 0.842 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.812 & ~
|
||||||
|
& \textit{hets} & 0.868 & ~
|
||||||
|
& \textbf{\textit{vsvr}} & 0.849 & ~
|
||||||
|
& \textit{vsvr} & 0.846 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\multirow{3}{*}{8} & 1
|
||||||
|
& \textbf{\textit{trivial}}
|
||||||
|
& 0.736 & \multirow{3}{*}{\rotatebox{90}{4402}}
|
||||||
|
& \textbf{\textit{hsma}}
|
||||||
|
& 0.865 & \multirow{3}{*}{\rotatebox{90}{3159}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.843 & \multirow{3}{*}{\rotatebox{90}{2575}}
|
||||||
|
& \textbf{\textit{hets}}
|
||||||
|
& 0.837 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
|
||||||
|
~ & 2
|
||||||
|
& \textit{hsma} & 0.759 & ~
|
||||||
|
& \textit{hets} & 0.874 & ~
|
||||||
|
& \textbf{\textit{vsvr}} & 0.848 & ~
|
||||||
|
& \textbf{\textit{vrfr}} & 0.841 & ~ \\
|
||||||
|
~ & 3
|
||||||
|
& \textit{pnaive} & 0.820 & ~
|
||||||
|
& \textit{hses} & 0.879 & ~
|
||||||
|
& \textit{hsma} & 0.850 & ~
|
||||||
|
& \textit{vsvr} & 0.847 & ~ \\
|
||||||
|
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Ranking of benchmark and horizontal models
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps):
|
||||||
|
the table shows the ranks for cases with $2.5 < ADD < 25$
|
||||||
|
(and $25 < ADD < \infty$ in parentheses if they differ)}
|
||||||
|
\label{t:hori:a}
|
||||||
|
\begin{tabular}{|c|ccc|cccccccc|}
|
||||||
|
\hline
|
||||||
|
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
|
||||||
|
& \multicolumn{3}{c|}{\thead{Benchmarks}}
|
||||||
|
& \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
|
||||||
|
\cline{2-12}
|
||||||
|
~ & \textit{naive} & \textit{fnaive} & \textit{paive}
|
||||||
|
& \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
|
||||||
|
& \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
|
||||||
|
\hline \hline
|
||||||
|
3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
|
||||||
|
& 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
|
||||||
|
4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
|
||||||
|
& 9 (10) & 10 (9) & 2 (8) & 1 (4) & 6 (7) \\
|
||||||
|
5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
|
||||||
|
& 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
|
||||||
|
6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
|
||||||
|
& 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
\
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Ranking of classical models on vertical time series
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps):
|
||||||
|
the table shows the ranks for cases with $2.5 < ADD < 25$
|
||||||
|
(and $25 < ADD < \infty$ in parentheses if they differ)}
|
||||||
|
\label{t:vert:a}
|
||||||
|
\begin{tabular}{|c|cc|ccccc|ccccc|}
|
||||||
|
\hline
|
||||||
|
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
|
||||||
|
& \multicolumn{2}{c|}{\thead{Benchmarks}}
|
||||||
|
& \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
|
||||||
|
& \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
|
||||||
|
\cline{2-13}
|
||||||
|
~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
|
||||||
|
& \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
|
||||||
|
& \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
|
||||||
|
\hline \hline
|
||||||
|
3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
|
||||||
|
& 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
|
||||||
|
4 & 2 (7) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
|
||||||
|
& 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (8) & 4 (2) & 11 (12) \\
|
||||||
|
5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
|
||||||
|
& 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
|
||||||
|
6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
|
||||||
|
& 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
|
||||||
|
7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
|
||||||
|
& 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
|
||||||
|
8 & 2 (1) & 1 (9) & 8 (5) & 7 & 10 (8)
|
||||||
|
& 6 & 12 (10) & 5 (2) & 4 & 9 (6) & 3 & 11 \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
\
|
||||||
|
|
||||||
|
\pagebreak
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\captionof{table}{Ranking of ML models on vertical time series
|
||||||
|
($1~\text{km}^2$ pixel size, 60-minute time steps):
|
||||||
|
the table shows the ranks for cases with $2.5 < ADD < 25$
|
||||||
|
(and $25 < ADD < \infty$ in parentheses if they differ)}
|
||||||
|
\label{t:ml:a}
|
||||||
|
\begin{tabular}{|c|cccc|cc|}
|
||||||
|
\hline
|
||||||
|
\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
|
||||||
|
& \multicolumn{4}{c|}{\thead{Benchmarks}}
|
||||||
|
& \multicolumn{2}{c|}{\thead{ML}} \\
|
||||||
|
\cline{2-7}
|
||||||
|
~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
|
||||||
|
& \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
|
||||||
|
\hline \hline
|
||||||
|
3 & 6 & 2 (5) & 1 (3) & 3 (1) & 5 (2) & 4 \\
|
||||||
|
4 & 6 (5) & 2 (3) & 1 (6) & 3 (2) & 5 (1) & 4 \\
|
||||||
|
5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
|
||||||
|
6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
|
||||||
|
7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
|
||||||
|
8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
\
|
58
tex/apx/tabular_ml_models.tex
Normal file
58
tex/apx/tabular_ml_models.tex
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
\section{Tabular and Real-time Forecasts without Retraining}
|
||||||
|
\label{tabular_ml_models}
|
||||||
|
|
||||||
|
Regarding the structure of the feature matrix for the ML models in Sub-section
|
||||||
|
\ref{ml_models}, we provide an alternative approach that works without
|
||||||
|
the STL method.
|
||||||
|
Instead of decomposing a time series and arranging the resulting
|
||||||
|
seasonally-adjusted time series $a_t$ into a matrix $\mat{X}$, one can
|
||||||
|
create a matrix with two types of feature columns mapped to the raw
|
||||||
|
observations in $\vec{y}$:
|
||||||
|
While the first group of columns takes all observations of the same time of
|
||||||
|
day over a horizon of, for example, one week ($n_h=7$), the second group
|
||||||
|
takes all observations covering a pre-defined time horizon, for example
|
||||||
|
$3$ hours ($n_r=3$ for 60-minute time steps), preceding the time step to
|
||||||
|
be fitted.
|
||||||
|
Thus, we exploit the two-dimensional structure of time tables as well, and
|
||||||
|
conceptually model historical and recent demand.
|
||||||
|
The alternative feature matrix appears as follows where the first three
|
||||||
|
columns are the historical and the last three the recent demand features:
|
||||||
|
|
||||||
|
$$
|
||||||
|
\vec{y}
|
||||||
|
=
|
||||||
|
\begin{pmatrix}
|
||||||
|
y_T \\
|
||||||
|
y_{T-1} \\
|
||||||
|
\dots \\
|
||||||
|
y_{1+n_hH}
|
||||||
|
\end{pmatrix}
|
||||||
|
~~~~~
|
||||||
|
\mat{X}
|
||||||
|
=
|
||||||
|
\begin{bmatrix}
|
||||||
|
y_{T-H} & y_{T-2H} & \dots & y_{T-n_hH}
|
||||||
|
& y_{T-1} & y_{T-2} & \dots & y_{T-n_r} \\
|
||||||
|
y_{T-1-H} & y_{T-1-2H} & \dots & y_{T-1-n_hH}
|
||||||
|
& y_{T-2} & y_{T-3} & \dots & y_{T-n_r-1} \\
|
||||||
|
\dots & \dots & \dots & \dots
|
||||||
|
& \dots & \dots & \dots & \dots \\
|
||||||
|
y_{1+(n_h-1)H} & y_{1+(n_h-2)H} & \dots & y_1
|
||||||
|
& y^*_{1+n_hH-1} & y^*_{1+n_hH-2} & \dots & y^*_{1+n_hH-n_r}
|
||||||
|
\end{bmatrix}
|
||||||
|
$$
|
||||||
|
\
|
||||||
|
|
||||||
|
Being a detail, we note that the recent demand features lying on the end of
|
||||||
|
the previous day are set to $0$, which is shown with the $^*$ notation
|
||||||
|
above.
|
||||||
|
This alignment of the undecomposed order data $y_t$ ensures that the ML
|
||||||
|
models learn the two seasonal patterns independently.
|
||||||
|
The parameters $n_h$ and $n_r$ must be adapted to the data, but we found the
|
||||||
|
above values to work well.
|
||||||
|
|
||||||
|
As such matrices resemble time tables, we refer to them as tabular.
|
||||||
|
However, we found the ML models with vertical time series to outperform the
|
||||||
|
tabular ML models, which is why we disregarded them in the study.
|
||||||
|
This tabular form could be beneficial for UDPs with a demand that exhibits
|
||||||
|
a weaker seasonality such as a meal delivery platform.
|
53
tex/meta.tex
Normal file
53
tex/meta.tex
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
\begin{frontmatter}
|
||||||
|
|
||||||
|
\journal{Transportation Research Part E}
|
||||||
|
\title{Real-time Demand Forecasting for an Urban Delivery Platform}
|
||||||
|
|
||||||
|
\author[WHU]{Alexander Hess\fnref{emails}\fnref{corresponding}}
|
||||||
|
\author[WHU]{Stefan Spinler\fnref{emails}}
|
||||||
|
\author[MIT]{Matthias Winkenbach\fnref{emails}}
|
||||||
|
\address[WHU]{
|
||||||
|
WHU - Otto Beisheim School of Management,
|
||||||
|
Burgplatz 2, 56179 Vallendar, Germany
|
||||||
|
}
|
||||||
|
\address[MIT]{
|
||||||
|
Massachusetts Institute of Technology,
|
||||||
|
77 Massachusetts Avenue, Cambridge, MA 02139, United States
|
||||||
|
}
|
||||||
|
\fntext[email]{
|
||||||
|
Emails:
|
||||||
|
alexander.hess@whu.edu,
|
||||||
|
stefan.spinler@whu.edu,
|
||||||
|
mwinkenb@mit.edu
|
||||||
|
}
|
||||||
|
|
||||||
|
\fntext[corresponding]{
|
||||||
|
The corresponding author is Alexander Hess.
|
||||||
|
Use the provided email.
|
||||||
|
}
|
||||||
|
|
||||||
|
\begin{abstract}
|
||||||
|
Meal delivery platforms like Uber Eats shape the landscape in cities around the world.
|
||||||
|
This paper addresses forecasting demand on a grid into the short-term future,
|
||||||
|
enabling, for example, predictive routing applications.
|
||||||
|
We propose an approach incorporating
|
||||||
|
both classical forecasting and machine learning methods
|
||||||
|
and adapt model evaluation and selection to typical demand:
|
||||||
|
intermittent with a double-seasonal pattern.
|
||||||
|
An empirical study shows that
|
||||||
|
an exponential smoothing based method trained on past demand data alone
|
||||||
|
achieves optimal accuracy,
|
||||||
|
if at least two months are on record.
|
||||||
|
With a more limited demand history,
|
||||||
|
machine learning is shown
|
||||||
|
to yield more accurate prediction results than classical methods.
|
||||||
|
\end{abstract}
|
||||||
|
|
||||||
|
\begin{keyword}
|
||||||
|
demand forecasting \sep
|
||||||
|
intermittent demand \sep
|
||||||
|
machine learning \sep
|
||||||
|
urban delivery platform
|
||||||
|
\end{keyword}
|
||||||
|
|
||||||
|
\end{frontmatter}
|
24
tex/preamble.tex
Normal file
24
tex/preamble.tex
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
% Use the document width more effectively.
|
||||||
|
\usepackage[margin=2.5cm]{geometry}
|
||||||
|
|
||||||
|
\usepackage[acronym]{glossaries}
|
||||||
|
\makeglossaries
|
||||||
|
|
||||||
|
% Enable captions for figures and tables.
|
||||||
|
\usepackage{caption}
|
||||||
|
|
||||||
|
% Enable diagonal lines in tables.
|
||||||
|
\usepackage{static/slashbox}
|
||||||
|
|
||||||
|
% Enable multiple lines in a table row
|
||||||
|
\usepackage{multirow}
|
||||||
|
|
||||||
|
% Make opening quotes look different than closing quotes.
|
||||||
|
\usepackage[english=american]{csquotes}
|
||||||
|
\MakeOuterQuote{"}
|
||||||
|
|
||||||
|
% Define helper commands.
|
||||||
|
\usepackage{bm}
|
||||||
|
\newcommand{\mat}[1]{\bm{#1}}
|
||||||
|
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
|
||||||
|
\newcommand{\thead}[1]{\textbf{#1}}
|
732
tex/references.bib
Normal file
732
tex/references.bib
Normal file
|
@ -0,0 +1,732 @@
|
||||||
|
@article{alcaraz2019,
|
||||||
|
title={Rich vehicle routing problem with last-mile outsourcing decisions},
|
||||||
|
author={Alcaraz, Juan J and Caballero-Arnaldos, Luis and Vales-Alonso, Javier},
|
||||||
|
year={2019},
|
||||||
|
journal={Transportation Research Part E: Logistics and Transportation Review},
|
||||||
|
volume={129},
|
||||||
|
pages={263--286}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{assimakopoulos2000,
|
||||||
|
title={The theta model: a decomposition approach to forecasting},
|
||||||
|
author={Assimakopoulos, Vassilis and Nikolopoulos, Konstantinos},
|
||||||
|
year={2000},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={16},
|
||||||
|
number={4},
|
||||||
|
pages={521--530}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{bao2004,
|
||||||
|
title={Forecasting intermittent demand by SVMs regression},
|
||||||
|
author={Bao, Yukun and Wang, Wen and Zhang, Jinlong},
|
||||||
|
year={2004},
|
||||||
|
booktitle={2004 IEEE International Conference on Systems, Man and Cybernetics},
|
||||||
|
volume={1},
|
||||||
|
pages={461--466}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{barbour2014,
|
||||||
|
title={psd: Adaptive, sine multitaper power spectral density estimation for R},
|
||||||
|
author={Barbour, Andrew J and Parker, Robert L},
|
||||||
|
year={2014},
|
||||||
|
journal={Computers \& Geosciences},
|
||||||
|
volume={63},
|
||||||
|
pages={1--8},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{bell2018,
|
||||||
|
title = {Forecasting at Uber: An Introduction},
|
||||||
|
author={Bell, Franziska and Smyl, Slawek},
|
||||||
|
year={2018},
|
||||||
|
howpublished = {\url{https://eng.uber.com/forecasting-introduction/}},
|
||||||
|
note = {Accessed: 2020-10-01}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{berbeglia2010,
|
||||||
|
title={Dynamic Pickup and Delivery Problems},
|
||||||
|
author={Berbeglia, Gerardo and Cordeau, Jean-Fran{\c{c}}ois
|
||||||
|
and Laporte, Gilbert},
|
||||||
|
year={2010},
|
||||||
|
journal={European Journal of Operational Research},
|
||||||
|
volume={202},
|
||||||
|
number={1},
|
||||||
|
pages={8--15},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{bergmann2020,
|
||||||
|
title={Integrating first-mile pickup and last-mile delivery
|
||||||
|
on shared vehicle routes for efficient urban e-commerce distribution},
|
||||||
|
author={Bergmann, Felix M and Wagner, Stephan M and Winkenbach, Matthias},
|
||||||
|
year={2020},
|
||||||
|
journal={Transportation Research Part B: Methodological},
|
||||||
|
volume={131},
|
||||||
|
pages={26--62},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{box1962,
|
||||||
|
title={Some statistical Aspects of adaptive Optimization and Control},
|
||||||
|
author={Box, George and Jenkins, Gwilym},
|
||||||
|
year={1962},
|
||||||
|
journal={Journal of the Royal Statistical Society. Series B (Methodological)},
|
||||||
|
volume={24},
|
||||||
|
number={2},
|
||||||
|
pages={297--343}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{box1964,
|
||||||
|
title={An Analysis of Transformations},
|
||||||
|
author={Box, George and Cox, David},
|
||||||
|
year={1964},
|
||||||
|
journal={Journal of the Royal Statistical Society. Series B (Methodological)},
|
||||||
|
volume={26},
|
||||||
|
number={2},
|
||||||
|
pages={211--252}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{box1968,
|
||||||
|
title={Some recent Advances in Forecasting and Control},
|
||||||
|
author={Box, George and Jenkins, Gwilym},
|
||||||
|
year={1968},
|
||||||
|
journal={Journal of the Royal Statistical Society.
|
||||||
|
Series C (Applied Statistics)},
|
||||||
|
volume={17},
|
||||||
|
number={2},
|
||||||
|
pages={91--109}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{box2015,
|
||||||
|
title={Time Series Analysis: Forecasting and Control},
|
||||||
|
author={Box, George and Jenkins, Gwilym and Reinsel, Gregory and Ljung, Greta},
|
||||||
|
series={Wiley Series in Probability and Statistics},
|
||||||
|
year={2015},
|
||||||
|
publisher={Wiley}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{breiman1984,
|
||||||
|
title={Classification and Regression Trees},
|
||||||
|
author={Breiman, Leo and Friedman, Jerome and Olshen, R.A.
|
||||||
|
and Stone, Charles},
|
||||||
|
year={1984},
|
||||||
|
publisher={Wadsworth}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{breiman2001,
|
||||||
|
title={Random Forests},
|
||||||
|
author={Breiman, Leo},
|
||||||
|
year={2001},
|
||||||
|
journal={Machine Learning},
|
||||||
|
volume={45},
|
||||||
|
number={1},
|
||||||
|
pages={5--32}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{brockwell2016,
|
||||||
|
title={Introduction to Time Series and Forecasting},
|
||||||
|
author={Brockwell, Peter and Davis, Richard},
|
||||||
|
series={Springer Texts in Statistics},
|
||||||
|
year={2016},
|
||||||
|
publisher={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{brown1959,
|
||||||
|
title={Statistical Forecasting for Inventory Control},
|
||||||
|
author={Brown, Robert},
|
||||||
|
year={1959},
|
||||||
|
publisher={McGraw/Hill}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{chen2006a,
|
||||||
|
title={Hourly Water Demand Forecast Model based on Bayesian Least Squares
|
||||||
|
Support Vector Machine},
|
||||||
|
author={Chen, Lei and Zhang, Tu-qiao},
|
||||||
|
year={2006},
|
||||||
|
journal={Journal of Tianjin University},
|
||||||
|
volume={39},
|
||||||
|
number={9},
|
||||||
|
pages={1037--1042}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{chen2006b,
|
||||||
|
title={Hourly Water Demand Forecast Model based on Least Squares Support
|
||||||
|
Vector Machine},
|
||||||
|
author={Chen, Lei and Zhang, Tu-qiao},
|
||||||
|
year={2006},
|
||||||
|
journal={Journal of Harbin Institute of Technology},
|
||||||
|
volume={38},
|
||||||
|
number={9},
|
||||||
|
pages={1528--1530}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{cleveland1990,
|
||||||
|
title={STL: A Seasonal-Trend Decomposition Procedure Based on Loess},
|
||||||
|
author={Cleveland, Robert and Cleveland, Williiam and McRae, Jean
|
||||||
|
and Terpenning, Irma},
|
||||||
|
year={1990},
|
||||||
|
journal={Journal of Official Statistics},
|
||||||
|
volume={6},
|
||||||
|
number={1},
|
||||||
|
pages={3--73}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{croston1972,
|
||||||
|
title={Forecasting and Stock Control for intermittent Demands},
|
||||||
|
author={Croston, J Do},
|
||||||
|
year={1972},
|
||||||
|
journal={Journal of the Operational Research Society},
|
||||||
|
volume={23},
|
||||||
|
number={3},
|
||||||
|
pages={289--303}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{dagum2016,
|
||||||
|
title={Seasonal Adjustment Methods and Real Time Trend-Cycle Estimation},
|
||||||
|
author={Dagum, Estela and Bianconcini, Silvia},
|
||||||
|
series={Statistics for Social and Behavioral Sciences},
|
||||||
|
year={2016},
|
||||||
|
publisher={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{dantzig1959,
|
||||||
|
title={The truck dispatching problem},
|
||||||
|
author={Dantzig, George and Ramser, John},
|
||||||
|
year={1959},
|
||||||
|
journal={Management science},
|
||||||
|
volume={6},
|
||||||
|
number={1},
|
||||||
|
pages={80--91},
|
||||||
|
publisher={Informs}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{de2006,
|
||||||
|
title={25 Years of Time Series Forecasting},
|
||||||
|
author={De Gooijer, Jan and Hyndman, Rob},
|
||||||
|
year={2006},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={22},
|
||||||
|
number={3},
|
||||||
|
pages={443--473}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{drucker1997,
|
||||||
|
title={Support Vector Regression Machines},
|
||||||
|
author={Drucker, Harris and Burges, Christopher and Kaufman, Linda
|
||||||
|
and Smola, Alex and Vapnik, Vladimir},
|
||||||
|
year={1997},
|
||||||
|
booktitle={Advances in Neural Information Processing Systems},
|
||||||
|
pages={155--161},
|
||||||
|
organization={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{ehmke2018,
|
||||||
|
title={Optimizing for total costs in vehicle routing in urban areas},
|
||||||
|
author={Ehmke, Jan Fabian and Campbell, Ann M and Thomas, Barrett W},
|
||||||
|
year={2018},
|
||||||
|
journal={Transportation Research Part E: Logistics and Transportation Review},
|
||||||
|
volume={116},
|
||||||
|
pages={242--265}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{ferrucci2013,
|
||||||
|
title={A pro-active real-time Control Approach for Dynamic Vehicle Routing
|
||||||
|
Problems dealing with the Delivery of urgent Goods},
|
||||||
|
author={Ferrucci, Francesco and Bock, Stefan and Gendreau, Michel},
|
||||||
|
year={2013},
|
||||||
|
journal={European Journal of Operational Research},
|
||||||
|
volume={225},
|
||||||
|
number={1},
|
||||||
|
pages={130--141},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{gardner1985,
|
||||||
|
title={Forecasting Trends in Time Series},
|
||||||
|
author={Gardner, Everette and McKenzie, Ed},
|
||||||
|
year={1985},
|
||||||
|
journal={Management Science},
|
||||||
|
volume={31},
|
||||||
|
number={10},
|
||||||
|
pages={1237--1246}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hansen2006,
|
||||||
|
title={Some Evidence on Forecasting Time-Series with Support Vector Machines},
|
||||||
|
author={Hansen, James and McDonald, James and Nelson, Ray},
|
||||||
|
year={2006},
|
||||||
|
journal={Journal of the Operational Research Society},
|
||||||
|
volume={57},
|
||||||
|
number={9},
|
||||||
|
pages={1053--1063}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{hastie2013,
|
||||||
|
title={The Elements of Statistical Learning: Data Mining, Inference,
|
||||||
|
and Prediction},
|
||||||
|
author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
|
||||||
|
year={2013},
|
||||||
|
publisher={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{herrera2010,
|
||||||
|
title={Predictive Models for Forecasting Hourly Urban Water Demand},
|
||||||
|
author={Herrera, Manuel and Torgo, Lu{\'\i}s and Izquierdo, Joaqu{\'\i}n
|
||||||
|
and P{\'e}rez-Garc{\'\i}a, Rafael},
|
||||||
|
year={2010},
|
||||||
|
journal={Journal of Hydrology},
|
||||||
|
volume={387},
|
||||||
|
number={1-2},
|
||||||
|
pages={141--150}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{hirschberg2016,
|
||||||
|
title = {McKinsey: The changing market for food delivery},
|
||||||
|
author={Hirschberg, Carsten and Rajko, Alexander and Schumacher, Thomas
|
||||||
|
and Wrulich, Martin},
|
||||||
|
year={2016},
|
||||||
|
howpublished = "\url{https://www.mckinsey.com/industries/high-tech/
|
||||||
|
our-insights/the-changing-market-for-food-delivery}",
|
||||||
|
note = {Accessed: 2020-10-01}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{ho1998,
|
||||||
|
title={The Random Subspace Method for Constructing Decision Forests},
|
||||||
|
author={Ho, Tin Kam},
|
||||||
|
year={1998},
|
||||||
|
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
|
||||||
|
volume={20},
|
||||||
|
number={8},
|
||||||
|
pages={832--844}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{holt1957,
|
||||||
|
title={Forecasting Seasonals and Trends by Exponentially Weighted Moving
|
||||||
|
Averages},
|
||||||
|
author={Holt, Charles},
|
||||||
|
year={1957},
|
||||||
|
journal={ONR Memorandum},
|
||||||
|
volume={52}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hou2018,
|
||||||
|
title={Ride-matching and routing optimisation: Models and a large
|
||||||
|
neighbourhood search heuristic},
|
||||||
|
author={Hou, Liwen and Li, Dong and Zhang, Dali},
|
||||||
|
year={2018},
|
||||||
|
journal={Transportation Research Part E: Logistics and Transportation Review},
|
||||||
|
volume={118},
|
||||||
|
pages={143--162}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hyndman2002,
|
||||||
|
title={A State Space Framework for Automatic Forecasting using Exponential
|
||||||
|
Smoothing Methods},
|
||||||
|
author={Hyndman, Rob and Koehler, Anne and Snyder, Ralph and Grose, Simone},
|
||||||
|
year={2002},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={18},
|
||||||
|
number={3},
|
||||||
|
pages={439--454}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hyndman2003,
|
||||||
|
title={Unmasking the Theta method},
|
||||||
|
author={Hyndman, Rob and Billah, Baki},
|
||||||
|
year={2003},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={19},
|
||||||
|
number={2},
|
||||||
|
pages={287--290}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hyndman2006,
|
||||||
|
title={Another Look at Measures of Forecast Accuracy},
|
||||||
|
author={Hyndman, Rob and Koehler, Anne},
|
||||||
|
year={2006},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={22},
|
||||||
|
number={4},
|
||||||
|
pages={679--688},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{hyndman2008a,
|
||||||
|
title={Automatic Time Series Forecasting: The forecast package for R},
|
||||||
|
author={Hyndman, Rob and Khandakar, Yeasmin},
|
||||||
|
year={2008},
|
||||||
|
journal={Journal of Statistical Software},
|
||||||
|
volume={26},
|
||||||
|
number={3}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{hyndman2008b,
|
||||||
|
title={Forecasting with Exponential Smoothing: the State Space Approach},
|
||||||
|
author={Hyndman, Rob and Koehler, Anne and Ord, Keith and Snyder, Ralph},
|
||||||
|
year={2008},
|
||||||
|
publisher={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{hyndman2018,
|
||||||
|
title={Forecasting: Principles and Practice},
|
||||||
|
author={Hyndman, Rob and Athanasopoulos, George},
|
||||||
|
year={2018},
|
||||||
|
publisher={OTexts}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{ichoua2006,
|
||||||
|
title={Exploiting Knowledge about Future Demands for Real-time Vehicle
|
||||||
|
Dispatching},
|
||||||
|
author={Ichoua, Soumia and Gendreau, Michel and Potvin, Jean-Yves},
|
||||||
|
year={2006},
|
||||||
|
journal={Transportation Science},
|
||||||
|
volume={40},
|
||||||
|
number={2},
|
||||||
|
pages={211--225},
|
||||||
|
publisher={INFORMS}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{janjevic2019,
|
||||||
|
title={Integrating collection-and-delivery points
|
||||||
|
in the strategic design of urban last-mile e-commerce distribution networks},
|
||||||
|
author={Janjevic, Milena and Winkenbach, Matthias and Merch{\'a}n, Daniel},
|
||||||
|
year={2019},
|
||||||
|
journal={Transportation Research Part E: Logistics and Transportation Review},
|
||||||
|
volume={131},
|
||||||
|
pages={37--67},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{janjevic2020,
|
||||||
|
title={Designing Multi-tier, Multi-service-level, and Multi-modal
|
||||||
|
Last-Mile Distribution Networks for Omni-Channel Operations},
|
||||||
|
author={Janjevic, Milena and Merchan, Daniel and Winkenbach, Matthias},
|
||||||
|
year={2020},
|
||||||
|
journal={European Journal of Operational Research},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{kim2016,
|
||||||
|
title={A new Metric of Absolute Percentage Error for Intermittent Demand
|
||||||
|
Forecasts},
|
||||||
|
author={Kim, Sungil and Kim, Heeyoung},
|
||||||
|
year={2016},
|
||||||
|
journal={International Journal of Forecasting},
|
||||||
|
volume={32},
|
||||||
|
number={3},
|
||||||
|
pages={669--679},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{kwiatkowski1992,
|
||||||
|
title={Testing the null hypothesis of stationarity against the alternative of a
|
||||||
|
unit root: How sure are we that economic time series have a unit root?},
|
||||||
|
author={Kwiatkowski, Denis and Phillips, Peter and Schmidt, Peter
|
||||||
|
and Shin, Yongcheol},
|
||||||
|
year={1992},
|
||||||
|
journal={Journal of Econometrics},
|
||||||
|
volume={54},
|
||||||
|
number={1-3},
|
||||||
|
pages={159--178}
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{laptev2017,
|
||||||
|
title = {Engineering Extreme Event Forecasting
|
||||||
|
at Uber with Recurrent Neural Networks},
|
||||||
|
author={Laptev, Nikolay and Smyl, Slawek and Shanmugam, Santhosh},
|
||||||
|
year={2017},
|
||||||
|
howpublished = {\url{https://eng.uber.com/neural-networks/}},
|
||||||
|
note = {Accessed: 2020-10-01}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{lemke2010,
|
||||||
|
title={Meta-Learning for Time Series Forecasting and Forecast Combination},
|
||||||
|
author={Lemke, Christiane and Gabrys, Bogdan},
|
||||||
|
year={2010},
|
||||||
|
journal={Neurocomputing},
|
||||||
|
volume={73},
|
||||||
|
number={10-12},
|
||||||
|
pages={2006--2016},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{ma2018,
|
||||||
|
title={Using the Gradient Boosting Decision Tree to Improve the Delineation of
|
||||||
|
Hourly Rain Areas during the Summer from Advanced Himawari Imager Data},
|
||||||
|
author={Ma, Liang and Zhang, Guoping and Lu, Er},
|
||||||
|
year={2018},
|
||||||
|
journal={Journal of Hydrometeorology},
|
||||||
|
volume={19},
|
||||||
|
number={5},
|
||||||
|
pages={761-776},
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{masmoudi2018,
|
||||||
|
title={The dial-a-ride problem with electric vehicles and battery
|
||||||
|
swapping stations},
|
||||||
|
author={Masmoudi, Mohamed Amine and Hosny, Manar and Demir, Emrah
|
||||||
|
and Genikomsakis, Konstantinos N and Cheikhrouhou, Naoufel},
|
||||||
|
year={2018},
|
||||||
|
journal={Transportation research part E: logistics and transportation review},
|
||||||
|
volume={118},
|
||||||
|
pages={392--420}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{mason2000,
|
||||||
|
title={Boosting algorithms as gradient descent},
|
||||||
|
author={Mason, Llew and Baxter, Jonathan and Bartlett, Peter L
|
||||||
|
and Frean, Marcus R},
|
||||||
|
year={2000},
|
||||||
|
booktitle={Advances in neural information processing systems},
|
||||||
|
pages={512--518}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{mueller1997,
|
||||||
|
title={Predicting Time Series with Support Vector Machines},
|
||||||
|
author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar
|
||||||
|
and Sch{\"o}lkopf, Bernhard and Kohlmorgen, Jens and Vapnik, Vladimir},
|
||||||
|
year={1997},
|
||||||
|
booktitle={International Conference on Artificial Neural Networks},
|
||||||
|
pages={999--1004},
|
||||||
|
organization={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{mueller1999,
|
||||||
|
title={Using Support Vector Machines for Time Series Prediction},
|
||||||
|
author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar
|
||||||
|
and Sch{\"o}lkopf, Bernhard and Kohlmorgen, Jens and Vapnik, Vladimir},
|
||||||
|
year={1999},
|
||||||
|
journal={Advances in Kernel Methods — Support Vector Learning},
|
||||||
|
pages={243--254},
|
||||||
|
publisher={MIT, Cambridge, MA, USA}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{ord2017,
|
||||||
|
title={Principles of Business Forecasting},
|
||||||
|
author={Ord, Keith and Fildes, Robert and Kourentzes, Nikos},
|
||||||
|
year={2017},
|
||||||
|
publisher={WESSEX Press}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{pegels1969,
|
||||||
|
title={Exponential Forecasting: Some new variations},
|
||||||
|
author={Pegels, C.},
|
||||||
|
year={1969},
|
||||||
|
journal={Management Science},
|
||||||
|
volume={15},
|
||||||
|
number={5},
|
||||||
|
pages={311--315}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{pillac2013,
|
||||||
|
title={A Review of Dynamic Vehicle Routing Problems},
|
||||||
|
author={Pillac, Victor and Gendreau, Michel and Gu{\'e}ret, Christelle
|
||||||
|
and Medaglia, Andr{\'e}s L},
|
||||||
|
year={2013},
|
||||||
|
journal={European Journal of Operational Research},
|
||||||
|
volume={225},
|
||||||
|
number={1},
|
||||||
|
pages={1--11},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{prestwich2014,
|
||||||
|
title={Mean-based Error Measures for Intermittent Demand Forecasting},
|
||||||
|
author={Prestwich, Steven and Rossi, Roberto and Tarim, Armagan
|
||||||
|
and Hnich, Brahim},
|
||||||
|
year={2014},
|
||||||
|
journal={International Journal of Production Research},
|
||||||
|
volume={52},
|
||||||
|
number={22},
|
||||||
|
pages={6782--6791},
|
||||||
|
publisher={Taylor \& Francis}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{psaraftis2016,
|
||||||
|
title={Dynamic Vehicle Routing Problems: Three Decades and Counting},
|
||||||
|
author={Psaraftis, Harilaos and Wen, Min and Kontovas, Christos},
|
||||||
|
year={2016},
|
||||||
|
journal={Networks},
|
||||||
|
volume={67},
|
||||||
|
number={1},
|
||||||
|
pages={3--31},
|
||||||
|
publisher={Wiley Online Library}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{reyes2018,
|
||||||
|
title={The Meal Delivery Routing Problem},
|
||||||
|
author={Reyes, Damian and Erera, Alan and Savelsbergh, Martin
|
||||||
|
and Sahasrabudhe, Sagar and O’Neil, Ryan},
|
||||||
|
year={2018},
|
||||||
|
journal={Optimization Online}
|
||||||
|
}
|
||||||
|
|
||||||
|
@incollection{scholkopf1998,
|
||||||
|
title={Fast Approximation of Support Vector Kernel Expansions, and an
|
||||||
|
Interpretation of Clustering as Approximation in Feature Spaces},
|
||||||
|
author={Sch{\"o}lkopf, Bernhard and Knirsch, Phil and Smola, Alex
|
||||||
|
and Burges, Chris},
|
||||||
|
year={1998},
|
||||||
|
booktitle={Mustererkennung 1998},
|
||||||
|
publisher={Springer},
|
||||||
|
pages={125--132}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{singleton2017,
|
||||||
|
title={Urban Analytics},
|
||||||
|
author={Singleton, Alex David and Spielman, Seth and Folch, David},
|
||||||
|
year={2017},
|
||||||
|
publisher={Sage}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{smola2004,
|
||||||
|
title={A Tutorial on Support Vector Regression},
|
||||||
|
author={Smola, Alex and Sch{\"o}lkopf, Bernhard},
|
||||||
|
year={2004},
|
||||||
|
journal={Statistics and Computing},
|
||||||
|
volume={14},
|
||||||
|
number={3},
|
||||||
|
pages={199--222}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{snoeck2020,
|
||||||
|
title={The value of physical distribution flexibility
|
||||||
|
in serving dense and uncertain urban markets},
|
||||||
|
author={Snoeck, Andr{\'e} and Winkenbach, Matthias},
|
||||||
|
year={2020},
|
||||||
|
journal={Transportation Research Part A: Policy and Practice},
|
||||||
|
volume={136},
|
||||||
|
pages={151--177},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{stitson1999,
|
||||||
|
title={Support Vector Regression with ANOVA Decomposition Kernels},
|
||||||
|
author={Stitson, Mark and Gammerman, Alex and Vapnik, Vladimir
|
||||||
|
and Vovk, Volodya and Watkins, Chris and Weston, Jason},
|
||||||
|
year={1999},
|
||||||
|
journal={Advances in Kernel Methods — Support Vector Learning},
|
||||||
|
pages={285--292},
|
||||||
|
publisher={MIT, Cambridge, MA, USA}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{syntetos2005,
|
||||||
|
title={The Accuracy of Intermittent Demand Estimates},
|
||||||
|
author={Syntetos, Aris and Boylan, John},
|
||||||
|
year={2005},
|
||||||
|
journal={International Journal of forecasting},
|
||||||
|
volume={21},
|
||||||
|
number={2},
|
||||||
|
pages={303--314},
|
||||||
|
publisher={Elsevier}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{taylor2003,
title={Exponential Smoothing with a Damped Multiplicative Trend},
author={Taylor, James},
year={2003},
journal={International Journal of Forecasting},
volume={19},
number={4},
pages={715--725}
}

@article{thomas2010,
title={Dynamic Vehicle Routing},
author={Thomas, Barrett W.},
year={2010},
journal={Wiley Encyclopedia of Operations Research and Management Science},
publisher={Wiley Online Library}
}

@book{toth2014,
title={Vehicle Routing: Problems, Methods, and Applications},
author={Toth, Paolo and Vigo, Daniele},
year={2014},
publisher={SIAM}
}

@techreport{ulmer2017,
title={The Restaurant Meal Delivery Problem: Dynamic Pick-up and Delivery with Deadlines and Random Ready Times},
author={Ulmer, Marlin and Thomas, Barrett and Campbell, Ann Melissa and Woyak, Nicholas},
year={2017},
institution={Technical Report}
}

@article{vapnik1963,
title={Pattern Recognition Using Generalized Portrait Method},
author={Vapnik, Vladimir and Lerner, A.},
year={1963},
journal={Automation and Remote Control},
volume={24},
pages={774--780}
}

@article{vapnik1964,
title={A Note on one Class of Perceptrons},
author={Vapnik, Vladimir and Chervonenkis, A.},
year={1964},
journal={Automation and Remote Control},
volume={25}
}

@book{vapnik2013,
title={The Nature of Statistical Learning Theory},
author={Vapnik, Vladimir},
year={2013},
publisher={Springer}
}

@article{wang2009,
title={Rule Induction for Forecasting Method Selection: Meta-learning the Characteristics of Univariate Time Series},
author={Wang, Xiaozhe and Smith-Miles, Kate and Hyndman, Rob},
year={2009},
journal={Neurocomputing},
volume={72},
number={10--12},
pages={2581--2594},
publisher={Elsevier}
}

@article{wang2018,
title={Delivering Meals for Multiple Suppliers: Exclusive or Sharing Logistics Service},
author={Wang, Zheng},
year={2018},
journal={Transportation Research Part E: Logistics and Transportation Review},
volume={118},
pages={496--512}
}

@article{winkenbach2015,
title={Enabling Urban Logistics Services at La Poste through Multi-Echelon Location-Routing},
author={Winkenbach, Matthias and Kleindorfer, Paul R. and Spinler, Stefan},
year={2015},
journal={Transportation Science},
volume={50},
number={2},
pages={520--540},
publisher={INFORMS}
}

@article{winters1960,
title={Forecasting Sales by Exponentially Weighted Moving Averages},
author={Winters, Peter},
year={1960},
journal={Management Science},
volume={6},
number={3},
pages={324--342}
}

@article{yildiz2018,
title={Provably High-Quality Solutions for the Meal Delivery Routing Problem},
author={Yildiz, Baris and Savelsbergh, Martin},
year={2018},
journal={Optimization Online}
}