diff --git a/paper.tex b/paper.tex index 04a42f1..78c1cd5 100644 --- a/paper.tex +++ b/paper.tex @@ -19,6 +19,16 @@ \input{tex/2_lit/3_ml/4_rf} \input{tex/2_lit/3_ml/5_svm} \input{tex/3_mod/1_intro} +\input{tex/3_mod/2_overall} +\input{tex/3_mod/3_grid} +\input{tex/3_mod/4_cv} +\input{tex/3_mod/5_mase} +\input{tex/3_mod/6_decomp} +\input{tex/3_mod/7_models/1_intro} +\input{tex/3_mod/7_models/2_hori} +\input{tex/3_mod/7_models/3_vert} +\input{tex/3_mod/7_models/4_rt} +\input{tex/3_mod/7_models/5_ml} \input{tex/4_stu/1_intro} \input{tex/5_con/1_intro} @@ -29,6 +39,10 @@ \appendix \newpage +\input{tex/apx/tabular_ml_models} +\newpage +\input{tex/apx/enhanced_feats} +\newpage \bibliographystyle{static/elsarticle-harv} \bibliography{tex/references} diff --git a/static/cross_validation_gray.png b/static/cross_validation_gray.png new file mode 100644 index 0000000..7deee21 Binary files /dev/null and b/static/cross_validation_gray.png differ diff --git a/static/gridification_for_paris_gray.png b/static/gridification_for_paris_gray.png new file mode 100644 index 0000000..78c72ca Binary files /dev/null and b/static/gridification_for_paris_gray.png differ diff --git a/static/model_inputs_gray.png b/static/model_inputs_gray.png new file mode 100644 index 0000000..67cc931 Binary files /dev/null and b/static/model_inputs_gray.png differ diff --git a/static/slashbox.sty b/static/slashbox.sty new file mode 100644 index 0000000..1712c9e --- /dev/null +++ b/static/slashbox.sty @@ -0,0 +1,77 @@ +% slashbox.sty by Koichi Yasuoka, May 27, 1993 +% minor modification by Toru Sato, May 31, 1993 +\typeout{slashbox style by K.Yasuoka, May 1993.}% +\newbox\@slashboxa +\newbox\@slashboxb +\newbox\@slashboxc +\newcount\@slashboxwd +\newcount\@slashboxht +\newdimen\@slashsepl +\newdimen\@slashsepr +\def\slashbox{% + \def\@slashboxpicture##1{% + \put(0,0){\line(##1,1){\@slashboxwd}}% + \put(0,\@slashboxht){\makebox(0,0)[tl]{\box\@slashboxa}}% + \put(\@slashboxwd,0){\makebox(0,0)[br]{\box\@slashboxb}}% + }% + \@slashbox +}% +\def\backslashbox{% + \def\@slashboxpicture##1{% + \put(0,\@slashboxht){\line(##1,-1){\@slashboxwd}}% + \put(0,0){\makebox(0,0)[bl]{\box\@slashboxa}}% + \put(\@slashboxwd,\@slashboxht){\makebox(0,0)[tr]{\box\@slashboxb}}% + }% + \@slashbox +}% +\def\@slashbox{\@ifnextchar [{\@@slashbox}{\@@slashbox[0pt]}} +\def\@@slashbox[#1]{\@ifnextchar [{\@@@slashbox[#1]}{\@@@slashbox[#1][c]}} +\def\@@@slashbox[#1][#2]#3#4{% +% #1: width, #2: suppression of \tabcolsep on `l', `r', or `lr' side +% #3: left item, #4: right item + \@slashsepl=\tabcolsep + \@slashsepr=\tabcolsep + \@tfor\@tempa :=#2\do{\expandafter\let + \csname @slashsep\@tempa\endcsname=\z@}% + \setbox\@slashboxa=\hbox{\strut\hskip\tabcolsep\shortstack[l]{#3}}% + \setbox\@slashboxb=\hbox{\shortstack[r]{#4}\hskip\tabcolsep\strut}% + \setbox\@slashboxa=\hbox{\raise\dp\@slashboxa\box\@slashboxa}% + \setbox\@slashboxb=\hbox{\raise\dp\@slashboxb\box\@slashboxb}% + \setbox\@slashboxc=\hbox{% + \@tempdima=\wd\@slashboxa + \advance\@tempdima by \wd\@slashboxb + \advance\@tempdima by \@slashsepl + \advance\@tempdima by \@slashsepr + \@tempdimb=#1\relax% + \ifdim\@tempdimb>\@tempdima \@tempdima=\@tempdimb\fi% + \@tempdimb=\ht\@slashboxa + \advance\@tempdimb by \dp\@slashboxa + \advance\@tempdimb by \ht\@slashboxb + \advance\@tempdimb by \dp\@slashboxb + \@tempcnta=\@tempdima + \@tempcntb=\@tempdimb + \advance\@tempcnta by \@tempcntb + \advance\@tempcnta by -1 + \divide\@tempcnta by \@tempcntb + \ifnum\@tempcnta>6 \@tempcnta=6 + 
\@tempdimb=0.166666666\@tempdima
+   \else
+    \ifnum\@tempcnta<1 \@tempcnta=1\fi
+    \@tempdima=\@tempdimb
+    \multiply\@tempdima by \@tempcnta
+   \fi%
+   \advance\@tempdima by -\@slashsepl
+   \advance\@tempdima by -\@slashsepr
+   \@slashboxwd=\@tempdima
+   \@slashboxht=\@tempdimb
+   \@tempcntb=\@slashsepl
+   \setlength{\unitlength}{1sp}%
+   \begin{picture}(\@slashboxwd,\@slashboxht)(\@tempcntb,0)
+   \advance\@tempdima by \@slashsepl
+   \advance\@tempdima by \@slashsepr
+   \@slashboxwd=\@tempdima
+   \@slashboxpicture{\@tempcnta}
+   \end{picture}%
+  }%
+  $\vcenter{\box\@slashboxc}$%
+}%
diff --git a/static/stl_gray.png b/static/stl_gray.png
new file mode 100644
index 0000000..58b0882
Binary files /dev/null and b/static/stl_gray.png differ
diff --git a/tex/3_mod/1_intro.tex b/tex/3_mod/1_intro.tex
index fdc9207..7ad6253 100644
--- a/tex/3_mod/1_intro.tex
+++ b/tex/3_mod/1_intro.tex
@@ -1,8 +1,6 @@
 \section{Model Formulation}
 \label{mod}
 
-% temporary placeholders
-\label{decomp}
-\label{f:stl}
-\label{mase}
-\label{unified_cv}
\ No newline at end of file
+In this section, we describe how the platform's raw data are pre-processed
+ into model inputs and how the forecasting models are built and benchmarked
+ against each other.
diff --git a/tex/3_mod/2_overall.tex b/tex/3_mod/2_overall.tex
new file mode 100644
index 0000000..e3a8d72
--- /dev/null
+++ b/tex/3_mod/2_overall.tex
@@ -0,0 +1,28 @@
+\subsection{Overall Approach}
+\label{approach_approach}
+
+On a conceptual level, there are three distinct aspects of the model
+ development process.
+First, a pre-processing step transforms the platform's tabular order data into
+ either time series in Sub-section \ref{grid} or feature matrices in
+ Sub-section \ref{ml_models}.
+Second, a benchmark methodology is developed in Sub-section \ref{unified_cv}
+ that compares all models on the same scale, in particular, classical
+ models with ML ones.
+Concretely, the CV approach is adapted to the peculiar requirements of
+ sub-daily and ordinal time series data.
+This is done to maximize the predictive power of all models into the future
+ while keeping their comparison fair.
+Third, the forecasting models are described with respect to their assumptions
+ and training requirements.
+Four classification dimensions are introduced:
+\begin{enumerate}
+\item \textbf{Timeliness of the Information}:
+ whole-day-ahead vs. real-time forecasts
+\item \textbf{Time Series Decomposition}: raw vs. decomposed
+\item \textbf{Algorithm Type}: "classical" statistics vs. ML
+\item \textbf{Data Sources}: pure vs. enhanced (i.e., with external data)
+\end{enumerate}
+Not all of the sixteen possible combinations are implemented; instead, the
+ models are varied along these dimensions to show different effects and
+ answer the research questions.
diff --git a/tex/3_mod/3_grid.tex b/tex/3_mod/3_grid.tex
new file mode 100644
index 0000000..baa0828
--- /dev/null
+++ b/tex/3_mod/3_grid.tex
@@ -0,0 +1,95 @@
+\subsection{Gridification, Time Tables, and Time Series Generation}
+\label{grid}
+
+The platform's tabular order data are sliced with respect to both location and
+ time and then aggregated into time series where an observation gives
+ the number of orders in an area for a time step/interval.
+Figure \ref{f:grid} shows how the orders' delivery locations are each
+ matched to a square-shaped cell, referred to as a pixel, on a grid
+ covering the entire service area within a city.
+This gridification step is also applied to the pickup locations separately.
+The lower-left corner is chosen at random. +\cite{winkenbach2015} apply the same gridification idea and slice an urban + area to model a location-routing problem, and \cite{singleton2017} portray + it as a standard method in the field of urban analytics. +With increasing pixel sizes, the time series exhibit more order aggregation + with a possibly stronger demand pattern. +On the other hand, the larger the pixels, the less valuable become the + generated forecasts as, for example, a courier sent to a pixel + preemptively then faces a longer average distance to a restaurant in the + pixel. + +\begin{center} +\captionof{figure}{Gridification for delivery locations in Paris with a pixel + size of $1~\text{km}^2$} +\label{f:grid} +\includegraphics[width=.8\linewidth]{static/gridification_for_paris_gray.png} +\end{center} + +After gridification, the ad-hoc orders within a pixel are aggregated by their + placement timestamps into sub-daily time steps of pre-defined lengths + to obtain a time table as exemplified in Figure \ref{f:timetable} with + one-hour intervals. + +\begin{center} +\captionof{figure}{Aggregation into a time table with hourly time steps} +\label{f:timetable} +\begin{tabular}{|c||*{9}{c|}} + \hline + \backslashbox{Time}{Day} & \makebox[2em]{\ldots} + & \makebox[3em]{Mon} & \makebox[3em]{Tue} + & \makebox[3em]{Wed} & \makebox[3em]{Thu} + & \makebox[3em]{Fri} & \makebox[3em]{Sat} + & \makebox[3em]{Sun} & \makebox[2em]{\ldots} \\ + \hline + \hline + 11:00 & \ldots & $y_{11,Mon}$ & $y_{11,Tue}$ & $y_{11,Wed}$ & $y_{11,Thu}$ + & $y_{11,Fri}$ & $y_{11,Sat}$ & $y_{11,Sun}$ & \ldots \\ + \hline + 12:00 & \ldots & $y_{12,Mon}$ & $y_{12,Tue}$ & $y_{12,Wed}$ & $y_{12,Thu}$ + & $y_{12,Fri}$ & $y_{12,Sat}$ & $y_{12,Sun}$ & \ldots \\ + \hline + \ldots & \ldots & \ldots & \ldots & \ldots + & \ldots & \ldots & \ldots & \ldots & \ldots \\ + \hline + 20:00 & \ldots & $y_{20,Mon}$ & $y_{20,Tue}$ & $y_{20,Wed}$ & $y_{20,Thu}$ + & $y_{20,Fri}$ & $y_{20,Sat}$ & $y_{20,Sun}$ & \ldots \\ + \hline + 21:00 & \ldots & $y_{21,Mon}$ & $y_{21,Tue}$ & $y_{21,Wed}$ & $y_{21,Thu}$ + & $y_{21,Fri}$ & $y_{21,Sat}$ & $y_{21,Sun}$ & \ldots \\ + \hline + \ldots & \ldots & \ldots & \ldots & \ldots + & \ldots & \ldots & \ldots & \ldots & \ldots \\ + \hline +\end{tabular} +\end{center} +\ + +Consequently, each $y_{t,d}$ in Figure \ref{f:timetable} is the number of + all orders within the pixel for the time of day $t$ and day of week + $d$ ($y_t$ and $y_{t,d}$ are the same but differ in that the latter + acknowledges a 2D view). +The same trade-off as with gridification applies: +The shorter the interval, the weaker is the demand pattern to be expected in + the time series due to less aggregation while longer intervals lead to + less usable forecasts. +We refer to time steps by their start time, and their number per day, $H$, + is constant. +Given a time table as in Figure \ref{f:timetable} there are two ways to + generate a time series by slicing: +\begin{enumerate} + \item \textbf{Horizontal View}: + Take only the order counts for a given time of the day + \item \textbf{Vertical View}: + Take all order counts and remove the double-seasonal pattern induced + by the weekday and time of the day with decomposition +\end{enumerate} +Distinct time series are retrieved by iterating through the time tables either + horizontally or vertically in increments of a single time step. 
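+
+To make the slicing concrete, the following minimal sketch (illustrative
+ only; the use of pandas, the column names, and the assumption of projected
+ coordinates in meters are ours and not part of the platform's code base)
+ gridifies delivery locations and aggregates the orders into a time table
+ from which horizontal and vertical series can be read off:
+\begin{verbatim}
+import pandas as pd
+
+def gridify(orders: pd.DataFrame, pixel_size: float = 1000.0) -> pd.DataFrame:
+    """Assign each order to a square pixel of the given side length (meters)."""
+    out = orders.copy()
+    out["pixel_x"] = (out["x"] // pixel_size).astype(int)
+    out["pixel_y"] = (out["y"] // pixel_size).astype(int)
+    return out
+
+def time_table(orders: pd.DataFrame, freq: str = "60min") -> pd.DataFrame:
+    """Count orders per pixel and time step; rows = time of day, columns = date."""
+    counts = (orders
+              .groupby(["pixel_x", "pixel_y",
+                        pd.Grouper(key="placed_at", freq=freq)])
+              .size().rename("orders").reset_index())
+    counts["date"] = counts["placed_at"].dt.date
+    counts["time"] = counts["placed_at"].dt.time
+    return counts.pivot_table(index=["pixel_x", "pixel_y", "time"],
+                              columns="date", values="orders", fill_value=0)
+
+# A horizontal series is one row of this table (a fixed time of day);
+# a vertical series stacks all rows of a pixel over consecutive dates.
+\end{verbatim}
+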
+Another property of a generated time series is its length, which, following + the next sub-section, can be interpreted as the sum of the production + training set and the test day. +In summary, a distinct time series is generated from the tabular order data + based on a configuration of parameters for the dimensions pixel size, + number of daily time steps $H$, shape (horizontal vs. vertical), length, + and the time step to be predicted. diff --git a/tex/3_mod/4_cv.tex b/tex/3_mod/4_cv.tex new file mode 100644 index 0000000..d3967c2 --- /dev/null +++ b/tex/3_mod/4_cv.tex @@ -0,0 +1,86 @@ +\subsection{Unified Cross-Validation and Training, Validation, and Test Sets} +\label{unified_cv} + +The standard $k$-fold CV, which assumes no structure in the individual + features of the samples, as shown in $\mat{X}$ above, is adapted to the + ordinal character of time series data: +A model must be evaluated on observations that occurred strictly after the + ones used for training as, otherwise, the model knows about the future. +Furthermore, some models predict only a single to a few time steps before + being retrained, while others predict an entire day without retraining + (cf., Sub-section \ref{ml_models}). +Consequently, we must use a unified time interval wherein all forecasts are + made first before the entire interval is evaluated. +As whole days are the longest prediction interval for models without + retraining, we choose that as the unified time interval. +In summary, our CV methodology yields a distinct best model per pixel and day + to be forecast. +Whole days are also practical for managers who commonly monitor, for example, + the routing and thus the forecasting performance on a day-to-day basis. +Our methodology assumes that the models are trained at least once per day. +As we create operational forecasts into the near future in this paper, + retraining all models with the latest available data is a logical step. + +\begin{center} +\captionof{figure}{Training, validation, and test sets + during cross validation} +\label{f:cv} +\includegraphics[width=.8\linewidth]{static/cross_validation_gray.png} +\end{center} + +The training, validation, and test sets are defined as follows. +To exemplify the logic, we refer to Figure \ref{f:cv} that shows the calendar + setup (i.e., weekdays on the x-axis) for three days $T_1$, $T_2$, and + $T_3$ (shown in dark gray) for which we generate forecasts. +Each of these days is, by definition, a test day, and the test set comprises + all time series, horizontal or vertical, whose last observation lies on + that day. +With an assumed training horizon of three weeks, the 21 days before each of + the test days constitute the corresponding training sets (shown in lighter + gray on the same rows as $T_1$, $T_2$, and $T_3$). +There are two kinds of validation sets, depending on the decision to be made. +First, if a forecasting method needs parameter tuning, the original training + set is divided into as many equally long series as validation days are + needed to find stable parameters. +The example shows three validation days per test day named $V_n$ (shown + in darker gray below each test day). +The $21 - 3 = 18$ preceding days constitute the training set corresponding to + a validation day. +To obtain the overall validation error, the three errors are averaged. +We call these \textit{inner} validation sets because they must be repeated + each day to re-tune the parameters and because the involved time series + are true subsets of the original series. 
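+
+Before turning to the outer level, the following minimal sketch illustrates
+ how the inner splits of the example can be generated (the function name and
+ the fixed 21-day horizon are assumptions chosen to mirror Figure \ref{f:cv},
+ not a general prescription):
+\begin{verbatim}
+import datetime as dt
+
+def inner_splits(test_day: dt.date, train_days: int = 21, n_val: int = 3):
+    """Yield (training_window, validation_day) pairs preceding a test day."""
+    inner_len = train_days - n_val            # e.g., 21 - 3 = 18 days
+    for i in range(n_val, 0, -1):             # oldest validation day first
+        val_day = test_day - dt.timedelta(days=i)
+        start = val_day - dt.timedelta(days=inner_len)
+        training = [start + dt.timedelta(days=d) for d in range(inner_len)]
+        yield training, val_day
+\end{verbatim}
+The parameter setting with the lowest error averaged over the validation days
+ is then used to forecast the test day itself.
+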
+Second, to find the best method per day and pixel, the same averaging logic + is applied on the outer level. +For example, if we used two validation days to find the best method for $T_3$, + we would average the errors of $T_1$ and $T_2$ for each method and select + the winner; then, $T_1$ and $T_2$ constitute an \textit{outer} validation + set. +Whereas the number of inner validation days is method-specific and must be + chosen before generating any test day forecasts in the first place, the + number of outer validation days may be varied after the fact and is + determined empirically as we show in Section \ref{stu}. + +Our unified CV approach is also optimized for large-scale production settings, + for example, at companies like Uber. +As \cite{bell2018} note, there is a trade-off as to when each of the + inner time series in the example begins. +While the forecasting accuracy likely increases with more training days, + supporting inner series with increasing lengths, cutting the series + to the same length allows caching the forecasts and errors. +In the example, $V_3$, $V_5$, and $V_7$, as well as $V_6$ and $V_8$ are + identical despite belonging to different inner validation sets. +Caching is also possible on the outer level when searching for an optimal + number of validation days for model selection. +We achieved up to 80\% cache hit ratios in our implementation in the + empirical study, thereby saving computational resources by the same + amount. +Lastly, we assert that our suggested CV, because of its being unified + around whole test days and usage of fix-sized time series, is also + suitable for creating consistent learning curves and, thus, answering + \textbf{Q3} on the relationship between forecast accuracy and amount of + historic data: +We simply increase the length of the outer training set holding the test day + fixed. +Thus, independent of a method's need for parameter tuning, all methods have + the same demand history available for each test day forecast. diff --git a/tex/3_mod/5_mase.tex b/tex/3_mod/5_mase.tex new file mode 100644 index 0000000..173d433 --- /dev/null +++ b/tex/3_mod/5_mase.tex @@ -0,0 +1,87 @@ +\subsection{Accuracy Measures} +\label{mase} + +Choosing an error measure for both model selection and evaluation is not + straightforward when working with intermittent demand, as shown, for + example, by \cite{syntetos2005}, and one should understand the trade-offs + between measures. +\cite{hyndman2006} provide a study of measures with real-life data taken from + the popular M3-competition and find that most standard measures degenerate + under many scenarios. +They also provide a classification scheme for which we summarize the main + points as they apply to the UDP case: +\begin{enumerate} +\item \textbf{Scale-dependent Errors}: +The error is reported in the same unit as the raw data. +Two popular examples are the root mean square error (RMSE) and mean absolute + error (MAE). +They may be used for model selection and evaluation within a pixel, and are + intuitively interpretable; however, they may not be used to compare errors + of, for example, a low-demand pixel (e.g., at the UDP's service + boundary) with that of a high-demand pixel (e.g., downtown). +\item \textbf{Percentage Errors}: +The error is derived from the percentage errors of individual forecasts per + time step, and is also intuitively interpretable. +A popular example is the mean absolute percentage error (MAPE) that is the + primary measure in most forecasting competitions. 
+Whereas such errors could be applied both within and across pixels, they + cannot be calculated reliably for intermittent demand. +If only one time step exhibits no demand, the result is a divide-by-zero + error. +This often occurs even in high-demand pixels due to the slicing. +\item \textbf{Relative Errors}: +A workaround is to calculate a scale-dependent error for the test day and + divide it by the same measure calculated with forecasts of a simple + benchmark method (e.g., na\"{i}ve method). +An example could be + $\text{RelMAE} = \text{MAE} / \text{MAE}_\text{bm}$. +Nevertheless, even simple methods create (near-)perfect forecasts, and then + $\text{MAE}_\text{bm}$ becomes (close to) $0$. +These numerical instabilities occurred so often in our studies that we argue + against using such measures. +\item \textbf{Scaled Errors}: +\cite{hyndman2006} contribute this category and introduce the mean absolute + scaled error (\gls{mase}). +It is defined as the MAE from the actual forecasting method on the test day + (i.e., "out-of-sample") divided by the MAE from the (seasonal) na\"{i}ve + method on the entire training set (i.e., "in-sample"). +A MASE of $1$ indicates that a forecasting method has the same accuracy + on the test day as the (seasonal) na\"{i}ve method applied on a longer + horizon, and lower values imply higher accuracy. +Within a pixel, its results are identical to the ones obtained with MAE. +Also, we acknowledge recent publications, for example, \cite{prestwich2014} or + \cite{kim2016}, showing other ways of tackling the difficulties mentioned. +However, only the MASE provided numerically stable results for all + forecasts in our study. +\end{enumerate} +Consequently, we use the MASE with a seasonal na\"{i}ve benchmark as the + primary measure in this paper. +With the previously introduced notation, it is defined as follows: +$$ +\text{MASE} +:= +\frac{\text{MAE}_{\text{out-of-sample}}}{\text{MAE}_{\text{in-sample}}} += +\frac{\text{MAE}_{\text{forecasts}}}{\text{MAE}_{\text{training}}} += +\frac{\frac{1}{H} \sum_{h=1}^H |y_{T+h} - \hat{y}_{T+h}|} + {\frac{1}{T-k} \sum_{t=k+1}^T |y_{t} - y_{t-k}|} +$$ +The denominator can only become $0$ if the seasonal na\"{i}ve benchmark makes + a perfect forecast on each day in the training set except the first seven + days, which never happened in our case study involving hundreds of + thousands of individual model trainings. +Further, as per the discussion in the subsequent Section \ref{decomp}, we also + calculate peak-MASEs where we leave out the time steps of non-peak times + from the calculations. +For this analysis, we define all time steps that occur at lunch (i.e., noon to + 2 pm) and dinner time (i.e., 6 pm to 8 pm) as peak. +As time steps in non-peak times typically average no or very low order counts, + a UDP may choose to not actively forecast these at all and be rather + interested in the accuracies of forecasting methods during peaks only. + +We conjecture that percentage error measures may be usable for UDPs facing a + higher overall demand with no intra-day down-times in between but have to + leave that to a future study. +Yet, even with high and steady demand, divide-by-zero errors are likely to + occur. 
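+
+For reference, the MASE computation defined above can be sketched as follows
+ (a minimal NumPy-based illustration; the function signature is our own and
+ not a reference implementation):
+\begin{verbatim}
+import numpy as np
+
+def mase(y_test, y_pred, y_train, season_length):
+    """Out-of-sample MAE scaled by the in-sample seasonal-naive MAE."""
+    y_test, y_pred = np.asarray(y_test), np.asarray(y_pred)
+    y_train = np.asarray(y_train)
+    mae_out = np.mean(np.abs(y_test - y_pred))
+    mae_in = np.mean(np.abs(y_train[season_length:]
+                            - y_train[:-season_length]))
+    return mae_out / mae_in   # assumes mae_in > 0, cf. the discussion above
+\end{verbatim}
+In our setting, the seasonal period (the \texttt{season\_length} argument)
+ spans one week of observations, which is why the denominator can only vanish
+ if the na\"{i}ve benchmark is perfect on all training days except the first
+ seven.
+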
\ No newline at end of file diff --git a/tex/3_mod/6_decomp.tex b/tex/3_mod/6_decomp.tex new file mode 100644 index 0000000..62cb78b --- /dev/null +++ b/tex/3_mod/6_decomp.tex @@ -0,0 +1,76 @@ +\subsection{Time Series Decomposition} +\label{decomp} + +Concerning the time table in Figure \ref{f:timetable}, a seasonal demand + pattern is inherent to both horizontal and vertical time series. +First, the weekday influences if people eat out or order in with our partner + receiving more orders on Thursday through Saturday than the other four + days. +This pattern is part of both types of time series. +Second, on any given day, demand peaks occur around lunch and dinner times. +This only regards vertical series. +Statistical analyses show that horizontally sliced time series indeed exhibit + a periodicity of $k=7$, and vertically sliced series only yield a seasonal + component with a regular pattern if the periodicity is set to the product + of the number of weekdays and the daily time steps indicating a distinct + intra-day pattern per weekday. + +Figure \ref{f:stl} shows three exemplary STL decompositions for a + $1~\text{km}^2$ pixel and a vertical time series with 60-minute time steps + (on the x-axis) covering four weeks: +With the noisy raw data $y_t$ on the left, the seasonal and trend components, + $s_t$ and $t_t$, are depicted in light and dark gray for increasing $ns$ + parameters. +The plots include (seasonal) na\"{i}ve forecasts for the subsequent test day + as dotted lines. +The remainder components $r_t$ are not shown for conciseness. +The periodicity is set to $k = 7 * 12 = 84$ as our industry partner has $12$ + opening hours per day. + +\begin{center} +\captionof{figure}{STL decompositions for a medium-demand pixel with hourly + time steps and periodicity $k=84$} +\label{f:stl} +\includegraphics[width=.95\linewidth]{static/stl_gray.png} +\end{center} + +As described in Sub-section \ref{stl}, with $k$ being implied by the + application, at the very least, the length of the seasonal smoothing + window, represented by the $ns$ parameter, must be calibrated by the + forecaster: +It controls how many past observations go into each smoothened $s_t$. +Many practitioners, however, skip this step and set $ns$ to a big number, for + example, $999$, then referred to as "periodic." +For the other parameters, it is common to use the default values as + specified in \cite{cleveland1990}. +The goal is to find a decomposition with a regular pattern in $s_t$. +In Figure \ref{f:stl}, this is not true for $ns=7$ where, for + example, the four largest bars corresponding to the same time of day a + week apart cannot be connected by an approximately straight line. +On the contrary, a regular pattern in the most extreme way exists for + $ns=999$, where the same four largest bars are of the same height. +This observation holds for each time step of the day. +For $ns=11$, $s_t$ exhibits a regular pattern whose bars adapt over time: +The pattern is regular as bars corresponding to the same time of day can be + connected by approximately straight lines, and it is adaptive as these + lines are not horizontal. +The trade-off between small and large values for $ns$ can thus be interpreted + as allowing the average demand during peak times to change over time: +If demand is intermittent at non-peak times, it is reasonable to expect the + bars to change over time as only the relative differences between peak and + non-peak times impact the bars' heights with the seasonal component being + centered around $0$. 
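+
+In practice, such decompositions can be produced with off-the-shelf STL
+ implementations; the following minimal sketch mirrors the setup behind
+ Figure \ref{f:stl} (the use of the statsmodels port of the original R
+ routine is our assumption; "series" stands for a vertical time series of
+ order counts with $H = 12$ daily time steps):
+\begin{verbatim}
+from statsmodels.tsa.seasonal import STL
+
+def decompose(series, ns, period=84):
+    """Run STL with seasonal smoothing window ns (an odd integer)."""
+    result = STL(series, period=period, seasonal=ns).fit()
+    return result.seasonal, result.trend, result.resid
+
+# ns controls how many past observations enter each smoothed seasonal value;
+# ns = 7, 11, and 999 ("periodic") correspond to the decompositions shown
+# in the figure.
+\end{verbatim}
+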
+To confirm the goodness of a decomposition statistically, one way is to verify + that $r_t$ can be modeled as a typical error process like white noise + $\epsilon_t$. + +However, we suggest an alternative way of calibrating the STL method in an + automated fashion based on our unified CV approach. +As hinted at in Figure \ref{f:stl}, we interpret an STL decomposition as a + forecasting method on its own by just adding the (seasonal) na\"{i}ve + forecasts for $s_t$ and $t_t$ and predicting $0$ for $r_t$. +Then, the $ns$ parameter is tuned just like a parameter for an ML model. +To the best of our knowledge, this has not yet been proposed before. +Conceptually, forecasting with the STL method can be viewed as a na\"{i}ve + method with built-in smoothing, and it outperformed all other + benchmark methods in all cases. diff --git a/tex/3_mod/7_models/1_intro.tex b/tex/3_mod/7_models/1_intro.tex new file mode 100644 index 0000000..7f02444 --- /dev/null +++ b/tex/3_mod/7_models/1_intro.tex @@ -0,0 +1,20 @@ +\subsection{Forecasting Models} +\label{models} + +This sub-section describes the concrete models in our study. +Figure \ref{f:inputs} shows how we classify them into four families with + regard to the type of the time series, horizontal or vertical, and the + moment at which a model is trained: +Solid lines indicate that the corresponding time steps lie before the + training, and dotted lines show the time horizon predicted by a model. +For conciseness, we only show the forecasts for one test day. +The setup is the same for each inner validation day. + +\ + +\begin{center} +\captionof{figure}{Classification of the models by input type and training + moment} +\label{f:inputs} +\includegraphics[width=.95\linewidth]{static/model_inputs_gray.png} +\end{center} diff --git a/tex/3_mod/7_models/2_hori.tex b/tex/3_mod/7_models/2_hori.tex new file mode 100644 index 0000000..5d5959a --- /dev/null +++ b/tex/3_mod/7_models/2_hori.tex @@ -0,0 +1,42 @@ +\subsubsection{Horizontal and Whole-day-ahead Forecasts.} +\label{hori} + +The upper-left in Figure \ref{f:inputs} illustrates the simplest way to + generate forecasts for a test day before it has started: +For each time of the day, the corresponding horizontal slice becomes the input + for a model. +With whole days being the unified time interval, each model is trained $H$ + times, providing a one-step-ahead forecast. +While it is possible to have models of a different type be selected per time + step, that did not improve the accuracy in the empirical study. +As the models in this family do not include the test day's demand data in + their training sets, we see them as benchmarks to answer \textbf{Q4}, + checking if a UDP can take advantage of real-time information. 
+The models in this family are as follows; we use prefixes, such as "h" here, + when methods are applied in other families as well: +\begin{enumerate} +\item \textit{\gls{naive}}: + Observation from the same time step one week prior +\item \textit{\gls{trivial}}: + Predict $0$ for all time steps +\item \textit{\gls{hcroston}}: + Intermittent demand method introduced by \cite{croston1972} +\item \textit{\gls{hholt}}, + \textit{\gls{hhwinters}}, + \textit{\gls{hses}}, + \textit{\gls{hsma}}, and + \textit{\gls{htheta}}: + Exponential smoothing without calibration +\item \textit{\gls{hets}}: + ETS calibrated as described by \cite{hyndman2008b} +\item \textit{\gls{harima}}: + ARIMA calibrated as described by \cite{hyndman2008a} +\end{enumerate} +\textit{naive} and \textit{trivial} provide an absolute benchmark for the + actual forecasting methods. +\textit{hcroston} is often mentioned in the context of intermittent demand; + however, the method did not perform well at all. +Besides \textit{hhwinters} that always fits a seasonal component, the + calibration heuristics behind \textit{hets} and \textit{harima} may do so + as well. +With $k=7$, an STL decomposition is unnecessary here. diff --git a/tex/3_mod/7_models/3_vert.tex b/tex/3_mod/7_models/3_vert.tex new file mode 100644 index 0000000..5459b81 --- /dev/null +++ b/tex/3_mod/7_models/3_vert.tex @@ -0,0 +1,39 @@ +\subsubsection{Vertical and Whole-day-ahead Forecasts without Retraining.} +\label{vert} + +The upper-right in Figure \ref{f:inputs} shows an alternative way to + generate forecasts for a test day before it has started: +First, a seasonally-adjusted time series $a_t$ is obtained from a vertical + time series by STL decomposition. +Then, the actual forecasting model, trained on $a_t$, makes an $H$-step-ahead + prediction. +Lastly, we add the $H$ seasonal na\"{i}ve forecasts for the seasonal component + $s_t$ to them to obtain the actual predictions for the test day. +Thus, only one training is required per model type, and no real-time data is + used. +By decomposing the raw time series, all long-term patterns are assumed to be + in the seasonal component $s_t$, and $a_t$ only contains the level with + a potential trend and auto-correlations. +The models in this family are: +\begin{enumerate} +\item \textit{\gls{fnaive}}, + \textit{\gls{pnaive}}: + Sum of STL's trend and seasonal components' na\"{i}ve forecasts +\item \textit{\gls{vholt}}, + \textit{\gls{vses}}, and + \textit{\gls{vtheta}}: + Exponential smoothing without calibration and seasonal + fit +\item \textit{\gls{vets}}: + ETS calibrated as described by \cite{hyndman2008b} +\item \textit{\gls{varima}}: + ARIMA calibrated as described by \cite{hyndman2008a} +\end{enumerate} +As mentioned in Sub-section \ref{unified_cv}, we include the sum of the + (seasonal) na\"{i}ve forecasts of the STL's trend and seasonal components + as forecasts on their own: +For \textit{fnaive}, we tune the "flexible" $ns$ parameter, and for + \textit{pnaive}, we set it to a "periodic" value. +Thus, we implicitly assume that there is no signal in the remainder $r_t$, and + predict $0$ for it. +\textit{fnaive} and \textit{pnaive} are two more simple benchmarks. 
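+
+In our notation, and assuming the additive decomposition of Sub-section
+ \ref{decomp}, the predictions of this family are composed as
+$$
+\hat{y}_{T+h}
+=
+\hat{a}_{T+h} + s_{T+h-7H},
+~~~~~ h = 1, \dots, H,
+$$
+where $\hat{a}_{T+h}$ is the $H$-step-ahead forecast of the seasonally-adjusted
+ series and $s_{T+h-7H}$ is the seasonal na\"{i}ve forecast of the seasonal
+ component; for \textit{fnaive} and \textit{pnaive}, $\hat{a}_{T+h}$ reduces
+ to the na\"{i}ve forecast $t_T$ of the trend component.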
diff --git a/tex/3_mod/7_models/4_rt.tex b/tex/3_mod/7_models/4_rt.tex new file mode 100644 index 0000000..75122cc --- /dev/null +++ b/tex/3_mod/7_models/4_rt.tex @@ -0,0 +1,22 @@ +\subsubsection{Vertical and Real-time Forecasts with Retraining.} +\label{rt} + +The lower-left in Figure \ref{f:inputs} shows how models trained on vertical + time series are extended with real-time order data as it becomes available + during a test day: +Instead of obtaining an $H$-step-ahead forecast, we retrain a model after + every time step and only predict one step. +The remainder is as in the previous sub-section, and the models are: +\begin{enumerate} +\item \textit{\gls{rtholt}}, + \textit{\gls{rtses}}, and + \textit{\gls{rttheta}}: + Exponential smoothing without calibration and seasonal fit +\item \textit{\gls{rtets}}: + ETS calibrated as described by \cite{hyndman2008b} +\item \textit{\gls{rtarima}}: + ARIMA calibrated as described by \cite{hyndman2008a} +\end{enumerate} +Retraining \textit{fnaive} and \textit{pnaive} did not increase accuracy, and + thus we left them out. +A downside of this family is the significant increase in computing costs. diff --git a/tex/3_mod/7_models/5_ml.tex b/tex/3_mod/7_models/5_ml.tex new file mode 100644 index 0000000..26f22a6 --- /dev/null +++ b/tex/3_mod/7_models/5_ml.tex @@ -0,0 +1,54 @@ +\subsubsection{Vertical and Real-time Forecasts without Retraining.} +\label{ml_models} + +The lower-right in Figure \ref{f:inputs} shows how ML models take + real-time order data into account without retraining. +Based on the seasonally-adjusted time series $a_t$, we employ the feature + matrix and label vector representations from Sub-section \ref{learning} + and set $n$ to the number of daily time steps, $H$, to cover all potential + auto-correlations. +The ML models are trained once before a test day starts. +For training, the matrix and vector are populated such that $y_T$ is set to + the last time step of the day before the forecasts, $a_T$. +As the splitting during CV is done with whole days, the \gls{ml} models are + trained with training sets consisting of samples from all times of a day + in an equal manner. +Thus, the ML models learn to predict each time of the day. +For prediction on a test day, the $H$ observations preceding the time + step to be forecast are used as the input vector after seasonal + adjustment. +As a result, real-time data are included. +The models in this family are: +\begin{enumerate} +\item \textit{\gls{vrfr}}: RF trained on the matrix as described +\item \textit{\gls{vsvr}}: SVR trained on the matrix as described +\end{enumerate} +We tried other ML models such as gradient boosting machines but found + only RFs and SVRs to perform well in our study. +In the case of gradient boosting machines, this is to be expected as they are + known not to perform well in the presence of high noise - as is natural + with low count data - as shown, for example, by \cite{ma2018} or + \cite{mason2000}. +Also, deep learning methods are not applicable as the feature matrices only + consist of several hundred to thousands of rows (cf., Sub-section + \ref{params}). +In \ref{tabular_ml_models}, we provide an alternative feature matrix + representation that exploits the two-dimensional structure of time tables + without decomposing the time series. +In \ref{enhanced_feats}, we show how feature matrices are extended + to include predictors other than historical order data. 
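+
+For illustration, the feature matrix construction described in this
+ sub-section can be sketched as follows (scikit-learn and the default
+ hyperparameters are our assumptions; "a\_train" and "a\_recent" stand for
+ the seasonally-adjusted training series and the seasonally-adjusted
+ observations preceding the time step to be forecast, respectively):
+\begin{verbatim}
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.svm import SVR
+
+def lag_matrix(a: np.ndarray, n_lags: int):
+    """Predict a_t from its n_lags predecessors (most recent lag first)."""
+    y = a[n_lags:]
+    X = np.column_stack([a[n_lags - lag : len(a) - lag]
+                         for lag in range(1, n_lags + 1)])
+    return X, y
+
+# a_train, a_recent: 1-D NumPy arrays (assumed given)
+H = 12                                          # daily time steps
+X_train, y_train = lag_matrix(a_train, n_lags=H)
+vrfr = RandomForestRegressor().fit(X_train, y_train)
+vsvr = SVR().fit(X_train, y_train)
+
+# On the test day, the H preceding seasonally-adjusted observations form the
+# input vector, so real-time order data enters without any retraining.
+x_next = a_recent[-H:][::-1].reshape(1, -1)     # most recent observation first
+y_next = vrfr.predict(x_next)
+\end{verbatim}
+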
+However, regarding \textbf{Q5}, we can already state that none of the external
+ data sources of \ref{enhanced_feats} improves the results in our study.
+Due to the high number of time series in our study, investigating why no
+ external source improves the forecasts requires an automated approach to
+ analyzing the individual series.
+\cite{barbour2014} provide a spectral density estimation approach, called
+ the Shannon entropy, that measures the signal-to-noise ratio of a time
+ series as a number normalized between 0 and 1, where lower values
+ indicate a higher signal-to-noise ratio.
+We then look at averages of the estimates on a daily level per pixel and
+ find that including any of the external data sources from
+ \ref{enhanced_feats} always leads to significantly lower signal-to-noise
+ ratios.
+Thus, we conclude that, at least for the demand faced by our industry partner,
+ the historical order data already contain all of the signal.
diff --git a/tex/4_stu/1_intro.tex b/tex/4_stu/1_intro.tex
index 4a28425..29411c5 100644
--- a/tex/4_stu/1_intro.tex
+++ b/tex/4_stu/1_intro.tex
@@ -1,2 +1,5 @@
 \section{Empirical Study: A Meal Delivery Platform in Europe}
-\label{stu}
\ No newline at end of file
+\label{stu}
+
+% temporary placeholder
+\label{params}
\ No newline at end of file
diff --git a/tex/apx/case_study.tex b/tex/apx/case_study.tex
new file mode 100644
index 0000000..6019a43
--- /dev/null
+++ b/tex/apx/case_study.tex
@@ -0,0 +1,54 @@
+\section{Raw Order Data in the Case Study}
+\label{dataset}
+
+The raw data for the empirical study in Section \ref{stu} were provided by a
+ meal delivery platform operating in five cities in France in 2016.
+The platform received a total of 686,385 orders distributed as follows:
+
+\
+\begin{center}
+\begin{tabular}{llr}
+  \hline
+  \thead{City} & \thead{Launch Day} & \thead{Orders} \\
+  \hline
+  Bordeaux & July 18 & 64,012 \\
+  Lille & October 30 & 14,362 \\
+  Lyon & February 21 & 214,635 \\
+  Nantes & October 31 & 12,900 \\
+  Paris & March 7 & 380,476 \\
+\end{tabular}
+\end{center}
+\
+
+The part of the database relevant for forecasting can be thought of as one
+ table per city, where each row represents one order and consists of the
+ following groups of columns:
+\begin{enumerate}
+\item \textbf{Restaurant Data}
+  \begin{enumerate}
+  \item unique ID and name
+  \item pickup location as latitude-longitude pair
+  \end{enumerate}
+\item \textbf{Customer Data}
+  \begin{enumerate}
+  \item unique ID, name, and phone number
+  \item delivery location as latitude-longitude pair (mostly physical
+    addresses but also public spots)
+  \end{enumerate}
+\item \textbf{Timestamps}
+  \begin{enumerate}
+  \item placement via the smartphone app
+  \item fulfillment workflow (pickup, delivery, cancellation, re-deliveries)
+  \end{enumerate}
+\item \textbf{Courier Data}
+  \begin{enumerate}
+  \item unique ID, name, and phone number
+  \item shift data (begin, breaks, end)
+  \item average speed
+  \end{enumerate}
+\item \textbf{Order Details}
+  \begin{enumerate}
+  \item meals and drinks
+  \item prices and discounts granted
+  \end{enumerate}
+\end{enumerate}
diff --git a/tex/apx/enhanced_feats.tex b/tex/apx/enhanced_feats.tex
new file mode 100644
index 0000000..c6ef0f4
--- /dev/null
+++ b/tex/apx/enhanced_feats.tex
@@ -0,0 +1,121 @@
+\section{Enhancing Forecasting Models with External Data}
+\label{enhanced_feats}
+
+In this appendix, we show how the feature matrix in Sub-section
+ \ref{ml_models} can be extended with features other than historical order
+ data.
+Then, we provide an overview of what external data we tried out as predictors + in our empirical study. + +\subsection{Enhanced Feature Matrices} + +Feature matrices can naturally be extended by appending new feature columns + $x_{t,f}$ or $x_f$ on the right where the former represent predictors + changing throughout a day and the latter being static either within a + pixel or across a city. +$f$ refers to an external predictor variable, such as one of the examples + listed below. +In the SVR case, the columns should be standardized before fitting as external + predictors are most likely on a different scale than the historic order + data. +Thus, for a matrix with seasonally-adjusted order data $a_t$ in it, an + enhanced matrix looks as follows: + +$$ +\vec{y} += +\begin{pmatrix} + a_T \\ + a_{T-1} \\ + \dots \\ + a_{H+1} +\end{pmatrix} +~~~~~ +\mat{X} += +\begin{bmatrix} + a_{T-1} & a_{T-2} & \dots & a_{T-H} & ~~~ + & x_{T,A} & \dots & x_{B} & \dots \\ + a_{T-2} & a_{T-3} & \dots & a_{T-(H+1)} & ~~~ + & x_{T-1,A} & \dots & x_{B} & \dots \\ + \dots & \dots & \dots & \dots & ~~~ + & \dots & \dots & \dots & \dots \\ + a_H & a_{H-1} & \dots & a_1 & ~~~ + & x_{H+1,A} & \dots & x_{B} & \dots +\end{bmatrix} +$$ +\ + +Similarly, we can also enhance the tabular matrices from + \ref{tabular_ml_models}. +The same comments as for their pure equivalents in Sub-section \ref{ml_models} + apply, in particular, that ML models trained with an enhanced matrix can + process real-time data without being retrained. + +\subsection{External Data in the Empirical Study} +\label{external_data} + +In the empirical study, we tested four groups of external features that we + briefly describe here. + +\vskip 0.1in + +\textbf{Calendar Features}: +\begin{itemize} + \item Time of day (as synthesized integers: e.g., 1,050 for 10:30 am, + or 1,600 for 4 pm) + \item Day of week (as one-hot encoded booleans) + \item Work day or not (as booleans) +\end{itemize} + +\vskip 0.1in + +\textbf{Features derived from the historical Order Data}: +\begin{itemize} + \item Number of pre-orders for a time step (as integers) + \item 7-day SMA of the percentages of discounted orders (as percentages): + The platform is known for running marketing campaigns aimed at + first-time customers at irregular intervals. Consequently, the + order data show a wave-like pattern of coupons redeemed when looking + at the relative share of discounted orders per day. +\end{itemize} + +\vskip 0.1in + +\textbf{Neighborhood Features}: +\begin{itemize} + \item Ambient population (as integers) as obtained from the ORNL LandScan + database + \item Number of active platform restaurants (as integers) + \item Number of overall restaurants, food outlets, retailers, and other + businesses (as integers) as obtained from the Google Maps and Yelp + web services +\end{itemize} + +\vskip 0.1in + +\textbf{Real-time Weather} (raw data obtained from IBM's + Wunderground database): +\begin{itemize} + \item Absolute temperature, wind speed, and humidity + (as decimals and percentages) + \item Relative temperature with respect to 3-day and 7-day historical + means (as decimals) + \item Day vs. night defined by sunset (as booleans) + \item Summarized description (as indicators $-1$, $0$, and $+1$) + \item Lags of the absolute temperature and the summaries covering the + previous three hours +\end{itemize} + +\vskip 0.1in + +Unfortunately, we must report that none of the mentioned external data + improved the accuracy of the forecasts. 
+Some led to models overfitting the data, which could not be regulated. +Manual tests revealed that real-time weather data are the most promising + external source. +Nevertheless, the data provided by IBM's Wunderground database originate from + weather stations close to airports, which implies that we only have the + same aggregate weather data for the entire city. +If weather data is available on a more granular basis in the future, we see + some potential for exploitation. diff --git a/tex/apx/peak_results.tex b/tex/apx/peak_results.tex new file mode 100644 index 0000000..58b4dd8 --- /dev/null +++ b/tex/apx/peak_results.tex @@ -0,0 +1,261 @@ +\section{Forecasting Accuracies during Peak Times} +\label{peak_results} + +This appendix shows all result tables from the main text with the MASE + averages calculated from time steps within peak times. +Peaks are the times of the day where the typical customer has a lunch or + dinner meal and defined to be either from 12 pm to 2 pm or from 6 pm to + 8 pm. +While the exact decimals of the MASEs differ from the ones in the main + text, the relative ranks of the forecasting methods are the same except in + rare cases. + +\begin{center} +\captionof{table}{Top-3 models by training weeks and average demand + ($1~\text{km}^2$ pixel size, 60-minute time steps)} +\label{t:results:a} +\begin{tabular}{|c|c|*{12}{c|}} + +\hline +\multirow{3}{*}{\rotatebox{90}{\thead{Training}}} + & \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}} + & \multicolumn{3}{c|}{\thead{No Demand}} + & \multicolumn{3}{c|}{\thead{Low Demand}} + & \multicolumn{3}{c|}{\thead{Medium Demand}} + & \multicolumn{3}{c|}{\thead{High Demand}} \\ +~ & ~ + & \multicolumn{3}{c|}{(0 - 2.5)} + & \multicolumn{3}{c|}{(2.5 - 10)} + & \multicolumn{3}{c|}{(10 - 25)} + & \multicolumn{3}{c|}{(25 - $\infty$)} \\ +\cline{3-14} +~ & ~ + & Method & MASE & $n$ + & Method & MASE & $n$ + & Method & MASE & $n$ + & Method & MASE & $n$ \\ + +\hline \hline +\multirow{3}{*}{3} & 1 + & \textbf{\textit{trivial}} + & 0.794 & \multirow{3}{*}{\rotatebox{90}{4586}} + & \textbf{\textit{hsma}} + & 0.817 & \multirow{3}{*}{\rotatebox{90}{2975}} + & \textbf{\textit{hsma}} + & 0.838 & \multirow{3}{*}{\rotatebox{90}{2743}} + & \textbf{\textit{rtarima}} + & 0.871 & \multirow{3}{*}{\rotatebox{90}{2018}} \\ +~ & 2 + & \textit{hsma} & 0.808 & ~ + & \textit{hses} & 0.847 & ~ + & \textit{hses} & 0.851 & ~ + & \textit{rtses} & 0.872 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.938 & ~ + & \textit{hets} & 0.848 & ~ + & \textit{hets} & 0.853 & ~ + & \textit{rtets} & 0.874 & ~ \\ + +\hline +\multirow{3}{*}{4} & 1 + & \textbf{\textit{trivial}} + & 0.791 & \multirow{3}{*}{\rotatebox{90}{4532}} + & \textbf{\textit{hsma}} + & 0.833 & \multirow{3}{*}{\rotatebox{90}{3033}} + & \textbf{\textit{hsma}} + & 0.839 & \multirow{3}{*}{\rotatebox{90}{2687}} + & \textbf{\textit{vrfr}} + & 0.848 & \multirow{3}{*}{\rotatebox{90}{2016}} \\ +~ & 2 + & \textit{hsma} & 0.794 & ~ + & \textit{hses} & 0.838 & ~ + & \textit{hses} & 0.847 & ~ + & \textbf{\textit{rtarima}} & 0.851 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.907 & ~ + & \textit{hets} & 0.841 & ~ + & \textit{hets} & 0.851 & ~ + & \textit{rtses} & 0.857 & ~ \\ + +\hline +\multirow{3}{*}{5} & 1 + & \textbf{\textit{trivial}} + & 0.782 & \multirow{3}{*}{\rotatebox{90}{4527}} + & \textbf{\textit{hsma}} + & 0.844 & \multirow{3}{*}{\rotatebox{90}{3055}} + & \textbf{\textit{hsma}} + & 0.841 & \multirow{3}{*}{\rotatebox{90}{2662}} + & \textbf{\textit{vrfr}} + & 0.849 & \multirow{3}{*}{\rotatebox{90}{2019}} \\ +~ & 2 + & 
\textit{hsma} & 0.802 & ~ + & \textit{hses} & 0.851 & ~ + & \textit{hets} & 0.844 & ~ + & \textbf{\textit{rtarima}} & 0.851 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.888 & ~ + & \textit{hets} & 0.863 & ~ + & \textit{hses} & 0.845 & ~ + & \textit{vsvr} & 0.853 & ~ \\ + +\hline +\multirow{3}{*}{6} & 1 + & \textbf{\textit{trivial}} + & 0.743 & \multirow{3}{*}{\rotatebox{90}{4470}} + & \textbf{\textit{hsma}} + & 0.843 & \multirow{3}{*}{\rotatebox{90}{3086}} + & \textbf{\textit{hsma}} + & 0.841 & \multirow{3}{*}{\rotatebox{90}{2625}} + & \textbf{\textit{vrfr}} + & 0.844 & \multirow{3}{*}{\rotatebox{90}{2025}} \\ +~ & 2 + & \textit{hsma} & 0.765 & ~ + & \textit{hses} & 0.853 & ~ + & \textit{hses} & 0.844 & ~ + & \textbf{\textit{hets}} & 0.847 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.836 & ~ + & \textit{hets} & 0.861 & ~ + & \textit{hets} & 0.844 & ~ + & \textit{vsvr} & 0.849 & ~ \\ + +\hline +\multirow{3}{*}{7} & 1 + & \textbf{\textit{trivial}} + & 0.728 & \multirow{3}{*}{\rotatebox{90}{4454}} + & \textbf{\textit{hsma}} + & 0.855 & \multirow{3}{*}{\rotatebox{90}{3132}} + & \textbf{\textit{hets}} + & 0.843 & \multirow{3}{*}{\rotatebox{90}{2597}} + & \textbf{\textit{hets}} + & 0.839 & \multirow{3}{*}{\rotatebox{90}{2007}} \\ +~ & 2 + & \textit{hsma} & 0.744 & ~ + & \textit{hses} & 0.862 & ~ + & \textit{hsma} & 0.845 & ~ + & \textbf{\textit{vrfr}} & 0.842 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.812 & ~ + & \textit{hets} & 0.868 & ~ + & \textbf{\textit{vsvr}} & 0.849 & ~ + & \textit{vsvr} & 0.846 & ~ \\ + +\hline +\multirow{3}{*}{8} & 1 + & \textbf{\textit{trivial}} + & 0.736 & \multirow{3}{*}{\rotatebox{90}{4402}} + & \textbf{\textit{hsma}} + & 0.865 & \multirow{3}{*}{\rotatebox{90}{3159}} + & \textbf{\textit{hets}} + & 0.843 & \multirow{3}{*}{\rotatebox{90}{2575}} + & \textbf{\textit{hets}} + & 0.837 & \multirow{3}{*}{\rotatebox{90}{2002}} \\ +~ & 2 + & \textit{hsma} & 0.759 & ~ + & \textit{hets} & 0.874 & ~ + & \textbf{\textit{vsvr}} & 0.848 & ~ + & \textbf{\textit{vrfr}} & 0.841 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.820 & ~ + & \textit{hses} & 0.879 & ~ + & \textit{hsma} & 0.850 & ~ + & \textit{vsvr} & 0.847 & ~ \\ + +\hline +\end{tabular} +\end{center} + +\begin{center} +\captionof{table}{Ranking of benchmark and horizontal models + ($1~\text{km}^2$ pixel size, 60-minute time steps): + the table shows the ranks for cases with $2.5 < ADD < 25$ + (and $25 < ADD < \infty$ in parentheses if they differ)} +\label{t:hori:a} +\begin{tabular}{|c|ccc|cccccccc|} +\hline +\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}} + & \multicolumn{3}{c|}{\thead{Benchmarks}} + & \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\ +\cline{2-12} +~ & \textit{naive} & \textit{fnaive} & \textit{paive} + & \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt} + & \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\ +\hline \hline +3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3 + & 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\ +4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1) + & 9 (10) & 10 (9) & 2 (8) & 1 (4) & 6 (7) \\ +5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1) + & 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\ +6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1) + & 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\ +7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1) + & 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\ +8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1) + & 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\ +\hline +\end{tabular} +\end{center} +\ + +\begin{center} +\captionof{table}{Ranking of classical models 
on vertical time series + ($1~\text{km}^2$ pixel size, 60-minute time steps): + the table shows the ranks for cases with $2.5 < ADD < 25$ + (and $25 < ADD < \infty$ in parentheses if they differ)} +\label{t:vert:a} +\begin{tabular}{|c|cc|ccccc|ccccc|} +\hline +\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}} + & \multicolumn{2}{c|}{\thead{Benchmarks}} + & \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}} + & \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\ +\cline{2-13} +~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets} + & \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima} + & \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\ +\hline \hline +3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9) + & 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\ +4 & 2 (7) & 1 (10) & 6 (4) & 8 (6) & 10 (9) + & 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (8) & 4 (2) & 11 (12) \\ +5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9) + & 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\ +6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9) + & 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\ +7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9) + & 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\ +8 & 2 (1) & 1 (9) & 8 (5) & 7 & 10 (8) + & 6 & 12 (10) & 5 (2) & 4 & 9 (6) & 3 & 11 \\ +\hline +\end{tabular} +\end{center} +\ + +\pagebreak + +\begin{center} +\captionof{table}{Ranking of ML models on vertical time series + ($1~\text{km}^2$ pixel size, 60-minute time steps): + the table shows the ranks for cases with $2.5 < ADD < 25$ + (and $25 < ADD < \infty$ in parentheses if they differ)} +\label{t:ml:a} +\begin{tabular}{|c|cccc|cc|} +\hline +\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}} + & \multicolumn{4}{c|}{\thead{Benchmarks}} + & \multicolumn{2}{c|}{\thead{ML}} \\ +\cline{2-7} +~ & \textit{fnaive} & \textit{hets} & \textit{hsma} + & \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\ +\hline \hline +3 & 6 & 2 (5) & 1 (3) & 3 (1) & 5 (2) & 4 \\ +4 & 6 (5) & 2 (3) & 1 (6) & 3 (2) & 5 (1) & 4 \\ +5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\ +6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\ +7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\ +8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\ +\hline +\end{tabular} +\end{center} +\ diff --git a/tex/apx/tabular_ml_models.tex b/tex/apx/tabular_ml_models.tex new file mode 100644 index 0000000..dc1e5ae --- /dev/null +++ b/tex/apx/tabular_ml_models.tex @@ -0,0 +1,58 @@ +\section{Tabular and Real-time Forecasts without Retraining} +\label{tabular_ml_models} + +Regarding the structure of the feature matrix for the ML models in Sub-section + \ref{ml_models}, we provide an alternative approach that works without + the STL method. +Instead of decomposing a time series and arranging the resulting + seasonally-adjusted time series $a_t$ into a matrix $\mat{X}$, one can + create a matrix with two types of feature columns mapped to the raw + observations in $\vec{y}$: +While the first group of columns takes all observations of the same time of + day over a horizon of, for example, one week ($n_h=7$), the second group + takes all observations covering a pre-defined time horizon, for example + $3$ hours ($n_r=3$ for 60-minute time steps), preceding the time step to + be fitted. +Thus, we exploit the two-dimensional structure of time tables as well, and + conceptually model historical and recent demand. 
+The alternative feature matrix appears as follows where the first three + columns are the historical and the last three the recent demand features: + +$$ +\vec{y} += +\begin{pmatrix} + y_T \\ + y_{T-1} \\ + \dots \\ + y_{1+n_hH} +\end{pmatrix} +~~~~~ +\mat{X} += +\begin{bmatrix} + y_{T-H} & y_{T-2H} & \dots & y_{T-n_hH} + & y_{T-1} & y_{T-2} & \dots & y_{T-n_r} \\ + y_{T-1-H} & y_{T-1-2H} & \dots & y_{T-1-n_hH} + & y_{T-2} & y_{T-3} & \dots & y_{T-n_r-1} \\ + \dots & \dots & \dots & \dots + & \dots & \dots & \dots & \dots \\ + y_{1+(n_h-1)H} & y_{1+(n_h-2)H} & \dots & y_1 + & y^*_{1+n_hH-1} & y^*_{1+n_hH-2} & \dots & y^*_{1+n_hH-n_r} +\end{bmatrix} +$$ +\ + +Being a detail, we note that the recent demand features lying on the end of + the previous day are set to $0$, which is shown with the $^*$ notation + above. +This alignment of the undecomposed order data $y_t$ ensures that the ML + models learn the two seasonal patterns independently. +The parameters $n_h$ and $n_r$ must be adapted to the data, but we found the + above values to work well. + +As such matrices resemble time tables, we refer to them as tabular. +However, we found the ML models with vertical time series to outperform the + tabular ML models, which is why we disregarded them in the study. +This tabular form could be beneficial for UDPs with a demand that exhibits + a weaker seasonality such as a meal delivery platform. diff --git a/tex/glossary.tex b/tex/glossary.tex index 6c685f7..2c270f3 100644 --- a/tex/glossary.tex +++ b/tex/glossary.tex @@ -5,6 +5,9 @@ \newglossaryentry{cv}{ name=CV, description={Cross Validation} } +\newglossaryentry{mase}{ + name=MASE, description={Mean Absolute Scaled Error} +} \newglossaryentry{ml}{ name=ML, description={Machine Learning} } @@ -27,4 +30,103 @@ name=VRP, description={Vehicle Routing Problem} } +% Model names. 
+\newglossaryentry{naive}{ + name=naive, description={(Seasonal) Na\"{i}ve Method} +} +\newglossaryentry{fnaive}{ + name=fnaive, description={"Flexible" STL Decomposition, + with tuned ns parameter} +} +\newglossaryentry{pnaive}{ + name=pnaive, description={"Periodic" STL Decomposition, + with ns parameter set to large number} +} +\newglossaryentry{trivial}{ + name=trivial, description={Trivial Method} +} +\newglossaryentry{hcroston}{ + name=hcroston, description={Croston's Method, + trained on horizontal time series} +} +\newglossaryentry{hholt}{ + name=hholt, description={Holt's Linear Trend Method, + trained on horizontal time series} +} +\newglossaryentry{vholt}{ + name=vholt, description={Holt's Linear Trend Method, + trained on vertical time series} +} +\newglossaryentry{rtholt}{ + name=rtholt, description={Holt's Linear Trend Method, + (re)trained on vertical time series} +} +\newglossaryentry{hhwinters}{ + name=hhwinters, description={Holt-Winter's Seasonal Method, + trained on horizontal time series} +} +\newglossaryentry{hses}{ + name=hses, description={Simple Exponential Smoothing Method, + trained on horizontal time series} +} +\newglossaryentry{vses}{ + name=vses, description={Simple Exponential Smoothing Method, + trained on vertical time series} +} +\newglossaryentry{rtses}{ + name=rtses, description={Simple Exponential Smoothing Method, + (re)trained on vertical time series} +} +\newglossaryentry{hsma}{ + name=hsma, description={Simple Moving Average Method, + trained on horizontal time series} +} +\newglossaryentry{htheta}{ + name=htheta, description={Theta Method, + trained on horizontal time series} +} +\newglossaryentry{vtheta}{ + name=vtheta, description={Theta Method, + trained on vertical time series} +} +\newglossaryentry{rttheta}{ + name=rttheta, description={Theta Method, + (re)trained on vertical time series} +} +\newglossaryentry{hets}{ + name=hets, description={ETS State Space Method, + trained on horizontal time series} +} +\newglossaryentry{vets}{ + name=vets, description={ETS State Space Method, + trained on vertical time series} +} +\newglossaryentry{rtets}{ + name=rtets, description={ETS State Space Method, + (re)trained on vertical time series} +} +\newglossaryentry{harima}{ + name=harima, description={Autoregressive Integrated Moving Average + Method, + trained on horizontal time series} +} +\newglossaryentry{varima}{ + name=varima, description={Autoregressive Integrated Moving Average + Method, + trained on vertical time series} +} +\newglossaryentry{rtarima}{ + name=rtarima, description={Autoregressive Integrated Moving Average + Method, + (re)trained on vertical time series} +} +\newglossaryentry{vrfr}{ + name=vrfr, description={Random Forest Regression Method, + trained on vertical time series} +} +\newglossaryentry{vsvr}{ + name=vsvr, description={Support Vector Regression Method, + trained on vertical time series} +} + \printglossaries \ No newline at end of file diff --git a/tex/preamble.tex b/tex/preamble.tex index 85ddd10..0a2dffc 100644 --- a/tex/preamble.tex +++ b/tex/preamble.tex @@ -4,6 +4,12 @@ \usepackage[acronym]{glossaries} \makeglossaries +% Enable captions for figures and tables. +\usepackage{caption} + +% Enable diagonal lines in tables. +\usepackage{static/slashbox} + % Make opening quotes look different than closing quotes. 
\usepackage[english=american]{csquotes} \MakeOuterQuote{"} diff --git a/tex/references.bib b/tex/references.bib index 9d330ff..05fd42b 100644 --- a/tex/references.bib +++ b/tex/references.bib @@ -26,6 +26,16 @@ volume={1}, pages={461--466} } +@article{barbour2014, +title={psd: Adaptive, sine multitaper power spectral density estimation for R}, +author={Barbour, Andrew J and Parker, Robert L}, +year={2014}, +journal={Computers \& Geosciences}, +volume={63}, +pages={1--8}, +publisher={Elsevier} +} + @misc{bell2018, title = {Forecasting at Uber: An Introduction}, author={Bell, Franziska and Smyl, Slawek}, @@ -139,6 +149,16 @@ number={1}, pages={3--73} } +@article{croston1972, +title={Forecasting and Stock Control for intermittent Demands}, +author={Croston, J Do}, +year={1972}, +journal={Journal of the Operational Research Society}, +volume={23}, +number={3}, +pages={289--303} +} + @book{dagum2016, title={Seasonal Adjustment Methods and Real Time Trend-Cycle Estimation}, author={Dagum, Estela and Bianconcini, Silvia}, @@ -275,6 +295,17 @@ number={2}, pages={287--290} } +@article{hyndman2006, +title={Another Look at Measures of Forecast Accuracy}, +author={Hyndman, Rob and Koehler, Anne}, +year={2006}, +journal={International Journal of Forecasting}, +volume={22}, +number={4}, +pages={679--688}, +publisher={Elsevier} +} + @article{hyndman2008a, title={Automatic Time Series Forecasting: The forecast package for R}, author={Hyndman, Rob and Khandakar, Yeasmin}, @@ -298,6 +329,18 @@ year={2018}, publisher={OTexts} } +@article{kim2016, +title={A new Metric of Absolute Percentage Error for Intermittent Demand + Forecasts}, +author={Kim, Sungil and Kim, Heeyoung}, +year={2016}, +journal={International Journal of Forecasting}, +volume={32}, +number={3}, +pages={669--679}, +publisher={Elsevier} +} + @article{kwiatkowski1992, title={Testing the null hypothesis of stationarity against the alternative of a unit root: How sure are we that economic time series have a unit root?}, @@ -319,6 +362,17 @@ howpublished = {\url{https://eng.uber.com/neural-networks/}}, note = {Accessed: 2020-10-01} } +@article{ma2018, +title={Using the Gradient Boosting Decision Tree to Improve the Delineation of + Hourly Rain Areas during the Summer from Advanced Himawari Imager Data}, +author={Ma, Liang and Zhang, Guoping and Lu, Er}, +year={2018}, +journal={Journal of Hydrometeorology}, +volume={19}, +number={5}, +pages={761-776}, +} + @article{masmoudi2018, title={The dial-a-ride problem with electric vehicles and battery swapping stations}, @@ -330,6 +384,15 @@ volume={118}, pages={392--420} } +@inproceedings{mason2000, +title={Boosting algorithms as gradient descent}, +author={Mason, Llew and Baxter, Jonathan and Bartlett, Peter L + and Frean, Marcus R}, +year={2000}, +booktitle={Advances in neural information processing systems}, +pages={512--518} +} + @inproceedings{mueller1997, title={Predicting Time Series with Support Vector Machines}, author={M{\"u}ller, Klaus-Robert and Smola, Alexander and R{\"a}tsch, Gunnar @@ -367,6 +430,18 @@ number={5}, pages={311--315} } +@article{prestwich2014, +title={Mean-based Error Measures for Intermittent Demand Forecasting}, +author={Prestwich, Steven and Rossi, Roberto and Tarim, Armagan + and Hnich, Brahim}, +year={2014}, +journal={International Journal of Production Research}, +volume={52}, +number={22}, +pages={6782--6791}, +publisher={Taylor \& Francis} +} + @incollection{scholkopf1998, title={Fast Approximation of Support Vector Kernel Expansions, and an Interpretation of 
Clustering as Approximation in Feature Spaces}, @@ -378,6 +453,13 @@ publisher={Springer}, pages={125--132} } +@book{singleton2017, +title={Urban Analytics}, +author={Singleton, Alex David and Spielman, Seth and Folch, David}, +year={2017}, +publisher={Sage} +} + @article{smola2004, title={A Tutorial on Support Vector Regression}, author={Smola, Alex and Sch{\"o}lkopf, Bernhard}, @@ -398,6 +480,17 @@ pages={285--292}, publisher={MIT, Cambridge, MA, USA} } +@article{syntetos2005, +title={The Accuracy of Intermittent Demand Estimates}, +author={Syntetos, Aris and Boylan, John}, +year={2005}, +journal={International Journal of forecasting}, +volume={21}, +number={2}, +pages={303--314}, +publisher={Elsevier} +} + @article{taylor2003, title={Exponential Smoothing with a Damped Multiplicative Trend}, author={Taylor, James}, @@ -442,6 +535,18 @@ volume={118}, pages={496--512} } +@article{winkenbach2015, +title={Enabling urban logistics services at La Poste through + multi-echelon location-routing}, +author={Winkenbach, Matthias and Kleindorfer, Paul R and Spinler, Stefan}, +year={2015}, +journal={Transportation Science}, +volume={50}, +number={2}, +pages={520--540}, +publisher={INFORMS} +} + @article{winters1960, title={Forecasting Sales by Exponentially Weighted Moving Averages}, author={Winters, Peter},