From f010a69af21301379d5c2345494ab43dacbe5a8b Mon Sep 17 00:00:00 2001
From: Alexander Hess
Date: Sun, 4 Oct 2020 23:58:46 +0200
Subject: [PATCH] Add Study section

---
 paper.tex                        |  11 +-
 tex/4_stu/1_intro.tex            |   5 +-
 tex/4_stu/2_data.tex             |  23 +++
 tex/4_stu/3_params.tex           |  37 +++++
 tex/4_stu/4_overall.tex          | 238 +++++++++++++++++++++++++++++++
 tex/4_stu/5_training.tex         |  31 ++++
 tex/4_stu/6_fams.tex             | 162 +++++++++++++++++++++
 tex/4_stu/7_pixels_intervals.tex |  27 ++++
 tex/apx/peak_results.tex         |  13 +-
 tex/glossary.tex                 |   3 +
 tex/preamble.tex                 |   6 +-
 11 files changed, 544 insertions(+), 12 deletions(-)
 create mode 100644 tex/4_stu/2_data.tex
 create mode 100644 tex/4_stu/3_params.tex
 create mode 100644 tex/4_stu/4_overall.tex
 create mode 100644 tex/4_stu/5_training.tex
 create mode 100644 tex/4_stu/6_fams.tex
 create mode 100644 tex/4_stu/7_pixels_intervals.tex

diff --git a/paper.tex b/paper.tex
index 78c1cd5..0cded62 100644
--- a/paper.tex
+++ b/paper.tex
@@ -30,8 +30,13 @@
 \input{tex/3_mod/7_models/4_rt}
 \input{tex/3_mod/7_models/5_ml}
 \input{tex/4_stu/1_intro}
+\input{tex/4_stu/2_data}
+\input{tex/4_stu/3_params}
+\input{tex/4_stu/4_overall}
+\input{tex/4_stu/5_training}
+\input{tex/4_stu/6_fams}
+\input{tex/4_stu/7_pixels_intervals}
 \input{tex/5_con/1_intro}
-
 \newpage
 \input{tex/glossary}
@@ -43,6 +48,10 @@
 \newpage
 \input{tex/apx/enhanced_feats}
 \newpage
+\input{tex/apx/case_study}
+\newpage
+\input{tex/apx/peak_results}
+\newpage
 \bibliographystyle{static/elsarticle-harv}
 \bibliography{tex/references}

diff --git a/tex/4_stu/1_intro.tex b/tex/4_stu/1_intro.tex
index 29411c5..f57ddb1 100644
--- a/tex/4_stu/1_intro.tex
+++ b/tex/4_stu/1_intro.tex
@@ -1,5 +1,6 @@
 \section{Empirical Study: A Meal Delivery Platform in Europe}
 \label{stu}

-% temporary placeholder
-\label{params}
\ No newline at end of file
+In the following, we first give a brief overview of the case study dataset
+  and the parameters we applied to calibrate the time series generation.
+Then, we discuss the overall results.
diff --git a/tex/4_stu/2_data.tex b/tex/4_stu/2_data.tex
new file mode 100644
index 0000000..53e075e
--- /dev/null
+++ b/tex/4_stu/2_data.tex
@@ -0,0 +1,23 @@
+\subsection{Case Study Dataset}
+\label{data}
+
+The studied dataset consists of a meal delivery platform's entire
+  transactional data covering the French market from launch in February of
+  2016 to January of 2017.
+The platform operated in five cities throughout this period and received a
+  total of 686,385 orders.
+The forecasting models were developed based on the data from Lyon and Paris
+  in the period from August through December; this ensures comparability
+  across cities and avoids the irregularities in demand assumed for a new
+  service within the first operating weeks.
+The data exhibit a steady state: the UDP's service area remained unchanged,
+  and the numbers of orders and couriers grew organically and in lock-step.
+This does not mean that no new restaurants opened; however, a new restaurant
+  did not attract additional customers but absorbed demand from other
+  member restaurants.
+Results are similar in both cities, so we report them only for Paris for
+  conciseness.
+Lastly, the platform recorded all incoming orders, so there is no lost
+  demand.
+See \ref{dataset} for details on the raw data.
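+
+The sub-setting just described can be summarized in a few lines of pandas;
+  the raw export's file and column names (\texttt{orders.csv},
+  \texttt{city}, \texttt{placed\_at}) are hypothetical and only illustrate
+  the selection logic:
+\begin{verbatim}
+import pandas as pd
+
+# Hypothetical raw export of the platform's transactional data.
+orders = pd.read_csv("orders.csv", parse_dates=["placed_at"])
+
+# Keep Lyon and Paris between August and December 2016 only.
+study = orders[
+    orders["city"].isin(["Lyon", "Paris"])
+    & (orders["placed_at"] >= "2016-08-01")
+    & (orders["placed_at"] < "2017-01-01")
+]
+\end{verbatim}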
diff --git a/tex/4_stu/3_params.tex b/tex/4_stu/3_params.tex
new file mode 100644
index 0000000..329075a
--- /dev/null
+++ b/tex/4_stu/3_params.tex
@@ -0,0 +1,37 @@
+\subsection{Calibration of the Time Series Generation Process}
+\label{params}
+
+Independent of the concrete forecasting models, the time series generation
+  must be calibrated.
+We concentrate our forecasts on the pickup side for two reasons.
+First, there are significantly fewer restaurants than customers, which
+  results in more aggregation in the order counts and thus better pattern
+  recognition.
+Second, from an operational point of view, forecasts for the pickups are
+  more valuable because of the waiting times due to meal preparation.
+We choose pixel sizes of $0.5~\text{km}^2$, $1~\text{km}^2$,
+  $2~\text{km}^2$, and $4~\text{km}^2$, and time steps covering 60-, 90-,
+  and 120-minute windows.
+With the platform operating between 11 a.m. and 11 p.m., this results in
+  $H_{60} = 12$, $H_{90} = 9$, and $H_{120} = 6$ time steps per day and
+  corresponding frequencies $k_{60} = 7 \cdot 12 = 84$,
+  $k_{90} = 7 \cdot 9 = 63$, and $k_{120} = 7 \cdot 6 = 42$ for the
+  vertical time series.
+Smaller pixels and shorter time steps would have been more beneficial for
+  tactical routing, yet they yield no recognizable patterns.
+Time steps of 90 and 120 minutes are most likely too long for routing;
+  however, we keep them for comparison and note that a UDP may employ such
+  forecasts to activate more couriers at short notice if excessively high
+  demand is forecast an hour ahead.
+This could, for example, be implemented by paying couriers a premium if
+  they show up for work at short notice.
+We choose discrete lengths of 3, 4, 5, 6, 7, and 8 weeks as training
+  horizons because the structure within the pixels (i.e., the number and
+  kind of restaurants) is not stable for more than two months in a row over
+  the covered horizon.
+This is confirmed by the empirical finding that forecasting accuracy
+  improves with a longer training horizon, but the effect levels off after
+  about six to seven weeks:
+Demand patterns from more than two months ago no longer resemble recent
+  ones.
+
+In total, hundreds of thousands of distinct time series are forecast in the
+  study.
diff --git a/tex/4_stu/4_overall.tex b/tex/4_stu/4_overall.tex
new file mode 100644
index 0000000..dff9559
--- /dev/null
+++ b/tex/4_stu/4_overall.tex
@@ -0,0 +1,238 @@
+\subsection{Overall Results}
+\label{overall_results}
+
+Table \ref{t:results} summarizes the overall best-performing models grouped
+  by training horizon and a pixel's average daily demand (\gls{add}) for a
+  pixel size of $1~\text{km}^2$ and 60-minute time steps.
+Each combination of pixel and test day counts as one case, and the total
+  number of cases is denoted as $n$.
+Clustering the individual results revealed that a pixel's ADD over the
+  training horizon is the primary indicator of similarity and that three to
+  four clusters suffice to obtain cohesive groups:
+We labeled them "no", "low", "medium", and "high" demand pixels with
+  increasing ADD, and present the average MASE per cluster.
+The $n$ do not vary significantly across the training horizons, which
+  confirms that the platform did not grow area-wise and is indeed in a
+  steady state.
+We use this table to answer \textbf{Q1} regarding the overall best methods
+  under different ADDs.
+All result tables in the main text report MASEs calculated over all time
+  steps of a day.
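+For ease of reference, recall the MASE in its common (seasonal) form: with
+  $n$ training observations, $T$ test time steps, and $m$ as the seasonal
+  period of the respective time series ($m = 1$ recovers the original
+  non-seasonal definition), it relates the out-of-sample mean absolute
+  error to the in-sample mean absolute error of the na\"{i}ve seasonal
+  benchmark:
+\begin{equation*}
+\text{MASE}
+  = \frac{\frac{1}{T} \sum_{t=n+1}^{n+T} \lvert y_t - \hat{y}_t \rvert}
+         {\frac{1}{n - m} \sum_{t=m+1}^{n} \lvert y_t - y_{t-m} \rvert}
+\end{equation*}
+A MASE below $1$ thus indicates that a method beats this benchmark on
+  average.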
+In contrast, \ref{peak_results} shows the same tables with MASEs calculated
+  from time steps within peak times only (i.e., lunch from 12 pm to 2 pm
+  and dinner from 6 pm to 8 pm).
+The differences lie mainly in the decimals of the individual MASE averages,
+  while the ranks of the forecasting methods do not change except in rare
+  cases.
+This shows that the presented accuracies are driven by the forecasting
+  methods' accuracies at peak times:
+Intuitively, they all correctly predict zero demand for non-peak times.
+
+Unsurprisingly, the best model for pixels without demand (i.e.,
+  $0 < \text{ADD} < 2.5$) is \textit{trivial}.
+While \textit{hsma} also adapts well, its accuracy is lower.
+None of the more sophisticated models reaches a similar accuracy.
+The intuition behind this is that \textit{trivial} is the least distorted
+  by the relatively large proportion of noise given the low-count nature of
+  the time series.
+
+For low demand (i.e., $2.5 < \text{ADD} < 10$), there is also a clear
+  best-performing model, namely \textit{hsma}.
+As the non-seasonal \textit{hses} reaches an accuracy similar to its
+  potentially seasonal generalization \textit{hets}, we conclude that the
+  weekday seasonality is not yet strong enough to be recognized in
+  low-demand pixels.
+So, in the absence of seasonality, models that fit only a trend component
+  are the least susceptible to the noise.
+
+For medium demand (i.e., $10 < \text{ADD} < 25$) and training horizons up
+  to six weeks, the best-performing models are the same as for low demand.
+For longer horizons, \textit{hets} provides the highest accuracy.
+Thus, to fit a seasonal pattern, longer training horizons are needed.
+While \textit{vsvr} enters the top three, \textit{hets} has the edge as it
+  requires neither parameter tuning nor real-time data.
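+
+To make the simple winners concrete, the following minimal sketch shows one
+  way \textit{trivial} and \textit{hsma} can be implemented; the exact
+  averaging window of \textit{hsma} in the study may differ, and
+  \textit{trivial} is assumed to always predict zero demand, in line with
+  the discussion above.
+\begin{verbatim}
+import numpy as np
+
+def trivial_forecast(series: np.ndarray) -> float:
+    # Always predict zero demand; best in no-demand pixels.
+    return 0.0
+
+def hsma_forecast(series: np.ndarray, n_weeks: int) -> float:
+    # `series` is a horizontal time series: one demand count per day
+    # for a fixed time of day. The forecast for the next day averages
+    # the same weekday's `n_weeks` most recent observations.
+    same_weekday = series[-7::-7][:n_weeks]
+    return float(same_weekday.mean())
+\end{verbatim}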
+ +\begin{center} +\captionof{table}{Top-3 models by training weeks and average demand + ($1~\text{km}^2$ pixel size, 60-minute time steps)} +\label{t:results} +\begin{tabular}{|c|c|*{12}{c|}} + +\hline +\multirow{3}{*}{\rotatebox{90}{\thead{Training}}} + & \multirow{3}{*}{\rotatebox{90}{\thead{Rank}}} + & \multicolumn{3}{c|}{\thead{No Demand}} + & \multicolumn{3}{c|}{\thead{Low Demand}} + & \multicolumn{3}{c|}{\thead{Medium Demand}} + & \multicolumn{3}{c|}{\thead{High Demand}} \\ +~ & ~ + & \multicolumn{3}{c|}{(0 - 2.5)} + & \multicolumn{3}{c|}{(2.5 - 10)} + & \multicolumn{3}{c|}{(10 - 25)} + & \multicolumn{3}{c|}{(25 - $\infty$)} \\ +\cline{3-14} +~ & ~ + & Method & MASE & $n$ + & Method & MASE & $n$ + & Method & MASE & $n$ + & Method & MASE & $n$ \\ + +\hline \hline +\multirow{3}{*}{3} & 1 + & \textbf{\textit{trivial}} + & 0.785 & \multirow{3}{*}{\rotatebox{90}{4586}} + & \textbf{\textit{hsma}} + & 0.819 & \multirow{3}{*}{\rotatebox{90}{2975}} + & \textbf{\textit{hsma}} + & 0.839 & \multirow{3}{*}{\rotatebox{90}{2743}} + & \textbf{\textit{rtarima}} + & 0.872 & \multirow{3}{*}{\rotatebox{90}{2018}} \\ +~ & 2 + & \textit{hsma} & 0.809 & ~ + & \textit{hses} & 0.844 & ~ + & \textit{hses} & 0.858 & ~ + & \textit{rtses} & 0.873 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.958 & ~ + & \textit{hets} & 0.846 & ~ + & \textit{hets} & 0.859 & ~ + & \textit{rtets} & 0.877 & ~ \\ + +\hline +\multirow{3}{*}{4} & 1 + & \textbf{\textit{trivial}} + & 0.770 & \multirow{3}{*}{\rotatebox{90}{4532}} + & \textbf{\textit{hsma}} + & 0.825 & \multirow{3}{*}{\rotatebox{90}{3033}} + & \textbf{\textit{hsma}} + & 0.837 & \multirow{3}{*}{\rotatebox{90}{2687}} + & \textbf{\textit{vrfr}} + & 0.855 & \multirow{3}{*}{\rotatebox{90}{2016}} \\ +~ & 2 + & \textit{hsma} & 0.788 & ~ + & \textit{hses} & 0.848 & ~ + & \textit{hses} & 0.850 & ~ + & \textbf{\textit{rtarima}} & 0.855 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.917 & ~ + & \textit{hets} & 0.851 & ~ + & \textit{hets} & 0.854 & ~ + & \textit{rtses} & 0.860 & ~ \\ + +\hline +\multirow{3}{*}{5} & 1 + & \textbf{\textit{trivial}} + & 0.780 & \multirow{3}{*}{\rotatebox{90}{4527}} + & \textbf{\textit{hsma}} + & 0.841 & \multirow{3}{*}{\rotatebox{90}{3055}} + & \textbf{\textit{hsma}} + & 0.837 & \multirow{3}{*}{\rotatebox{90}{2662}} + & \textbf{\textit{vrfr}} + & 0.850 & \multirow{3}{*}{\rotatebox{90}{2019}} \\ +~ & 2 + & \textit{hsma} & 0.803 & ~ + & \textit{hses} & 0.859 & ~ + & \textit{hets} & 0.845 & ~ + & \textbf{\textit{rtarima}} & 0.852 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.889 & ~ + & \textit{hets} & 0.861 & ~ + & \textit{hses} & 0.845 & ~ + & \textit{vsvr} & 0.854 & ~ \\ + +\hline +\multirow{3}{*}{6} & 1 + & \textbf{\textit{trivial}} + & 0.741 & \multirow{3}{*}{\rotatebox{90}{4470}} + & \textbf{\textit{hsma}} + & 0.847 & \multirow{3}{*}{\rotatebox{90}{3086}} + & \textbf{\textit{hsma}} + & 0.840 & \multirow{3}{*}{\rotatebox{90}{2625}} + & \textbf{\textit{vrfr}} + & 0.842 & \multirow{3}{*}{\rotatebox{90}{2025}} \\ +~ & 2 + & \textit{hsma} & 0.766 & ~ + & \textit{hses} & 0.863 & ~ + & \textit{hets} & 0.842 & ~ + & \textbf{\textit{hets}} & 0.847 & ~ \\ +~ & 3 + & \textit{pnaive} & 0.837 & ~ + & \textit{hets} & 0.865 & ~ + & \textit{hses} & 0.848 & ~ + & \textit{vsvr} & 0.848 & ~ \\ + +\hline +\multirow{3}{*}{7} & 1 + & \textbf{\textit{trivial}} + & 0.730 & \multirow{3}{*}{\rotatebox{90}{4454}} + & \textbf{\textit{hsma}} + & 0.858 & \multirow{3}{*}{\rotatebox{90}{3132}} + & \textbf{\textit{hets}} + & 0.845 & \multirow{3}{*}{\rotatebox{90}{2597}} + & \textbf{\textit{hets}} + & 0.840 
  & \multirow{3}{*}{\rotatebox{90}{2007}} \\
+~ & 2
+  & \textit{hsma} & 0.754 & ~
+  & \textit{hses} & 0.871 & ~
+  & \textit{hsma} & 0.847 & ~
+  & \textbf{\textit{vrfr}} & 0.845 & ~ \\
+~ & 3
+  & \textit{pnaive} & 0.813 & ~
+  & \textit{hets} & 0.872 & ~
+  & \textbf{\textit{vsvr}} & 0.850 & ~
+  & \textit{vsvr} & 0.847 & ~ \\
+
+\hline
+\multirow{3}{*}{8} & 1
+  & \textbf{\textit{trivial}}
+  & 0.735 & \multirow{3}{*}{\rotatebox{90}{4402}}
+  & \textbf{\textit{hsma}}
+  & 0.867 & \multirow{3}{*}{\rotatebox{90}{3159}}
+  & \textbf{\textit{hets}}
+  & 0.846 & \multirow{3}{*}{\rotatebox{90}{2575}}
+  & \textbf{\textit{hets}}
+  & 0.836 & \multirow{3}{*}{\rotatebox{90}{2002}} \\
+~ & 2
+  & \textit{hsma} & 0.758 & ~
+  & \textit{hets} & 0.877 & ~
+  & \textbf{\textit{vsvr}} & 0.850 & ~
+  & \textbf{\textit{vrfr}} & 0.842 & ~ \\
+~ & 3
+  & \textit{pnaive} & 0.811 & ~
+  & \textit{hses} & 0.880 & ~
+  & \textit{hsma} & 0.851 & ~
+  & \textit{vsvr} & 0.849 & ~ \\
+
+\hline
+\end{tabular}
+\end{center}
+
+In summary, except for high demand, simple models trained on horizontal
+  time series work best.
+By contrast, high demand (i.e., $25 < \text{ADD} < \infty$) combined with
+  fewer than six training weeks is the only situation in which classical
+  models trained on vertical time series work well.
+Then, \textit{rtarima} outperforms its siblings from Sub-sections
+  \ref{vert} and \ref{rt}.
+We conjecture that intra-day auto-correlations, caused, for example, by
+  weather, are the reason for that.
+Intuitively, a certain amount of demand (i.e., a high enough
+  signal-to-noise ratio) is required for models with auto-correlation terms
+  to detect these correlations through the noise.
+That idea is supported by \textit{vrfr} reaching a similar accuracy under
+  high demand, as its tree structure allows it to fit auto-correlations.
+As both \textit{rtarima} and \textit{vrfr} incorporate recent demand,
+  real-time information can indeed improve accuracy.
+However, once models are trained on longer horizons, \textit{hets} is more
+  accurate than \textit{vrfr}.
+Thus, to answer \textbf{Q4}, we conclude that real-time information
+  improves accuracy only when merely three or four weeks of training
+  material are available.
+
+In addition to looking at the results in tables covering the entire
+  one-year horizon, we also created sub-analyses for the distinct seasons
+  spring, summer (including the long holiday season in France), and fall.
+Yet, none of the results portrayed in this and the subsequent sections
+  change in significant ways.
+We conjecture that there could be differences if the overall demand of the
+  UDP increased to a scale beyond the one this case study covers, and we
+  leave that to a follow-up study with a bigger UDP.
diff --git a/tex/4_stu/5_training.tex b/tex/4_stu/5_training.tex
new file mode 100644
index 0000000..ea5320d
--- /dev/null
+++ b/tex/4_stu/5_training.tex
@@ -0,0 +1,31 @@
+\subsection{Impact of the Training Horizon}
+\label{training}
+
+While it is reasonable to assume that forecasts become more accurate as the
+  training horizon expands, our study reveals that this is not uniformly
+  the case.
+First, without demand, \textit{trivial} indeed performs better with more
+  training material, but improved pattern recognition cannot be the cause
+  here.
+Instead, we argue that the longer a pixel has seen no steady demand, the
+  higher the chance that this will not change soon.
+Further, if we focus on shorter training horizons, the sample necessarily
+  contains cases where a pixel came into existence after a soon-to-be
+  popular restaurant joined the platform:
+Demand grows fast, making \textit{trivial} less accurate, and the pixel
+  soon moves to another cluster.
+
+Second, with low demand, the best-performing \textit{hsma} becomes less
+  accurate with more training material.
+While one could argue that this is due to \textit{hsma} not fitting a
+  trend, the less accurate \textit{hses} and \textit{hets} do fit a trend.
+Instead, we argue that any low-demand time series naturally exhibits a high
+  noise-to-signal ratio, and \textit{hsma} is the least susceptible to
+  noise.
+Then, to counter the missing trend term, the training horizon must be
+  shorter.
+
+With medium demand, a similar argument can be made; however, the signal
+  already becomes more apparent, favoring \textit{hets} with more training
+  data.
+
+Lastly, with high demand, the signal becomes so clear that more
+  sophisticated models can exploit longer training horizons.
diff --git a/tex/4_stu/6_fams.tex b/tex/4_stu/6_fams.tex
new file mode 100644
index 0000000..8fdd17d
--- /dev/null
+++ b/tex/4_stu/6_fams.tex
@@ -0,0 +1,162 @@
+\subsection{Results by Model Families}
+\label{fams}
+
+Besides the overall results, we provide an in-depth comparison of the
+  models within each family.
+Instead of reporting the MASE per model, we rank the models while holding
+  the training horizon fixed to ease comparison.
+Table \ref{t:hori} presents the models trained on horizontal time series.
+In addition to \textit{naive}, we already include \textit{fnaive} and
+  \textit{pnaive} here as more competitive benchmarks.
+The tables in this section report two rankings simultaneously:
+The first number is the rank resulting from lumping the low and medium
+  clusters together, which yields almost the same rankings when analyzed
+  individually.
+The ranks from high-demand pixels only are given in parentheses if they
+  differ.
+
+\begin{center}
+\captionof{table}{Ranking of benchmark and horizontal models
+    ($1~\text{km}^2$ pixel size, 60-minute time steps):
+    the table shows the ranks for cases with $2.5 < \text{ADD} < 25$
+    (and $25 < \text{ADD} < \infty$ in parentheses if they differ)}
+\label{t:hori}
+\begin{tabular}{|c|ccc|cccccccc|}
+\hline
+\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
+  & \multicolumn{3}{c|}{\thead{Benchmarks}}
+  & \multicolumn{8}{c|}{\thead{Horizontal (whole-day-ahead)}} \\
+\cline{2-12}
+~ & \textit{naive} & \textit{fnaive} & \textit{pnaive}
+  & \textit{harima} & \textit{hcroston} & \textit{hets} & \textit{hholt}
+  & \textit{hhwinters} & \textit{hses} & \textit{hsma} & \textit{htheta} \\
+\hline \hline
+3 & 11 & 7 (2) & 8 (5) & 5 (7) & 4 & 3
+  & 9 (10) & 10 (9) & 2 (6) & 1 & 6 (8) \\
+4 & 11 & 7 (2) & 8 (3) & 5 (6) & 4 (5) & 3 (1)
+  & 9 (10) & 10 (9) & 2 (7) & 1 (4) & 6 (8) \\
+5 & 11 & 7 (2) & 8 (4) & 5 (3) & 4 (9) & 3 (1)
+  & 9 (10) & 10 (5) & 2 (8) & 1 (6) & 6 (7) \\
+6 & 11 & 8 (5) & 9 (6) & 5 (4) & 4 (7) & 2 (1)
+  & 10 & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
+7 & 11 & 8 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
+  & 9 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
+8 & 11 & 9 (5) & 10 (6) & 5 (4) & 4 (7) & 2 (1)
+  & 8 (10) & 7 (2) & 3 (8) & 1 (9) & 6 (3) \\
+\hline
+\end{tabular}
+\end{center}
+\
+
+A first insight is that \textit{fnaive} is the best benchmark in all
+  scenarios:
+Decomposing flexibly by tuning the $ns$ parameter is worth the
+  computational cost.
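+How such a flexible decomposition can be implemented is sketched below with
+  statsmodels' STL; that \textit{fnaive} seasonally adjusts the series and
+  extrapolates na\"{i}vely in exactly this way is an assumption for
+  illustration.
+\begin{verbatim}
+import pandas as pd
+from statsmodels.tsa.seasonal import STL
+
+def fnaive_forecast(series: pd.Series, period: int, ns: int) -> float:
+    # Decompose with a tunable (odd) seasonal smoothing window `ns`,
+    # then extrapolate naively: the last seasonally adjusted value
+    # plus the seasonal component one full period back.
+    decomposition = STL(series, period=period, seasonal=ns).fit()
+    adjusted = series - decomposition.seasonal
+    return float(adjusted.iloc[-1] + decomposition.seasonal.iloc[-period])
+\end{verbatim}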
+Further, if one is limited in the number of non-na\"{i}ve methods,
+  \textit{hets} is the best compromise and works well across all demand
+  levels.
+For high demand, it is also the best model independent of the training
+  horizon.
+With low or medium demand, \textit{hsma} is the clear overall winner; yet,
+  with high demand, models with a seasonal fit (i.e., \textit{harima},
+  \textit{hets}, and \textit{hhwinters}) are more accurate, in particular,
+  for longer training horizons.
+This is due to the weekday demand patterns becoming stronger with higher
+  overall demand.
+
+\begin{center}
+\captionof{table}{Ranking of classical models on vertical time series
+    ($1~\text{km}^2$ pixel size, 60-minute time steps):
+    the table shows the ranks for cases with $2.5 < \text{ADD} < 25$
+    (and $25 < \text{ADD} < \infty$ in parentheses if they differ)}
+\label{t:vert}
+\begin{tabular}{|c|cc|ccccc|ccccc|}
+\hline
+\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
+  & \multicolumn{2}{c|}{\thead{Benchmarks}}
+  & \multicolumn{5}{c|}{\thead{Vertical (whole-day-ahead)}}
+  & \multicolumn{5}{c|}{\thead{Vertical (real-time)}} \\
+\cline{2-13}
+~ & \textit{hets} & \textit{hsma} & \textit{varima} & \textit{vets}
+  & \textit{vholt} & \textit{vses} & \textit{vtheta} & \textit{rtarima}
+  & \textit{rtets} & \textit{rtholt} & \textit{rtses} & \textit{rttheta} \\
+\hline \hline
+3 & 2 (10) & 1 (7) & 6 (4) & 8 (6) & 10 (9)
+  & 7 (5) & 11 (12) & 4 (1) & 5 (3) & 9 (8) & 3 (2) & 12 (11) \\
+4 & 2 (8) & 1 (10) & 6 (4) & 8 (6) & 10 (9)
+  & 7 (5) & 12 (11) & 3 (1) & 5 (3) & 9 (7) & 4 (2) & 11 (12) \\
+5 & 2 (3) & 1 (10) & 7 (5) & 8 (7) & 10 (9)
+  & 6 & 11 & 4 (1) & 5 (4) & 9 (8) & 3 (2) & 12 \\
+6 & 2 (1) & 1 (10) & 6 (5) & 8 (7) & 10 (9)
+  & 7 (6) & 11 (12) & 3 (2) & 5 (4) & 9 (8) & 4 (3) & 12 (11) \\
+7 & 2 (1) & 1 (10) & 8 (5) & 7 & 10 (9)
+  & 6 & 11 (12) & 5 (2) & 4 & 9 (8) & 3 & 12 (11) \\
+8 & 2 (1) & 1 (9) & 8 (5) & 7 (6) & 10 (8)
+  & 6 & 12 (10) & 5 (2) & 4 & 9 (7) & 3 & 11 \\
+\hline
+\end{tabular}
+\end{center}
+\
+
+Table \ref{t:vert} extends the previous analysis to classical models
+  trained on vertical time series.
+Now, the winners from before, \textit{hets} and \textit{hsma}, serve as
+  benchmarks.
+Whereas no improvements can be obtained for low and medium demand,
+  \textit{rtarima} and \textit{rtses} are the most accurate with high
+  demand and short training horizons.
+For six or more training weeks, \textit{hets} is still optimal.
+Independent of retraining and the demand level, the models' relative
+  performances are consistent:
+The \textit{*arima} and \textit{*ses} models are best, followed by
+  \textit{*ets}, \textit{*holt}, and \textit{*theta}.
+Thus, models that capture auto-correlations and short-term forecasting
+  errors via moving-average terms, and that are not distracted by trend
+  terms, are optimal for vertical series.
+
+Finally, Table \ref{t:ml} compares the two ML-based models against the
+  best-performing classical models and answers \textbf{Q2}:
+Again, no improvements can be obtained with low and medium demand; however,
+  with high demand, \textit{vrfr} has the edge over \textit{rtarima} for
+  training horizons up to six weeks.
+We conjecture that \textit{vrfr} fits auto-correlations better than
+  \textit{varima} and is not distracted by short-term noise, as
+  \textit{rtarima} may be due to its retraining.
+With seven or eight training weeks, \textit{hets} remains the overall
+  winner.
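+To illustrate how the ML models consume vertical time series, the following
+  minimal sketch frames demand forecasting as supervised learning on lagged
+  counts; the study's actual feature matrix may be richer, and the toy data
+  only mimic the series' shape.
+\begin{verbatim}
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+def make_lagged(vertical: np.ndarray, n_lags: int):
+    # Each row holds `n_lags` consecutive observations; the target is
+    # the observation immediately following them.
+    X = np.stack([vertical[i:i + n_lags]
+                  for i in range(len(vertical) - n_lags)])
+    y = vertical[n_lags:]
+    return X, y
+
+# Toy data: eight weeks of 60-minute time steps (k_60 = 7 * 12 = 84).
+demand = np.random.default_rng(1).poisson(3.0, size=8 * 84)
+X, y = make_lagged(demand, n_lags=84)
+model = RandomForestRegressor(n_estimators=100).fit(X, y)
+next_step = model.predict(demand[-84:].reshape(1, -1))
+\end{verbatim}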
+Interestingly, \textit{vsvr} is more accurate than \textit{vrfr} for low
+  and medium demand.
+We assume that \textit{vrfr} performs well only in the presence of strong
+  auto-correlations, which low and medium demand lack.
+
+\begin{center}
+\captionof{table}{Ranking of ML models on vertical time series
+    ($1~\text{km}^2$ pixel size, 60-minute time steps):
+    the table shows the ranks for cases with $2.5 < \text{ADD} < 25$
+    (and $25 < \text{ADD} < \infty$ in parentheses if they differ)}
+\label{t:ml}
+\begin{tabular}{|c|cccc|cc|}
+\hline
+\multirow{2}{*}{\rotatebox{90}{\thead{\scriptsize{Training}}}}
+  & \multicolumn{4}{c|}{\thead{Benchmarks}}
+  & \multicolumn{2}{c|}{\thead{ML}} \\
+\cline{2-7}
+~ & \textit{fnaive} & \textit{hets} & \textit{hsma}
+  & \textit{rtarima} & \textit{vrfr} & \textit{vsvr} \\
+\hline \hline
+3 & 6 & 2 (5) & 1 (4) & 3 (1) & 5 (2) & 4 (3) \\
+4 & 6 (5) & 2 (4) & 1 (6) & 3 (2) & 5 (1) & 4 (3) \\
+5 & 6 (5) & 2 (4) & 1 (6) & 4 (2) & 5 (1) & 3 \\
+6 & 6 (5) & 2 & 1 (6) & 4 & 5 (1) & 3 \\
+7 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
+8 & 6 (5) & 2 (1) & 1 (6) & 4 & 5 (2) & 3 \\
+\hline
+\end{tabular}
+\end{center}
+\
+
+Analogously, we created tables like Tables \ref{t:hori} to \ref{t:ml} for
+  forecasts with time steps of 90 and 120 minutes and find that the
+  relative rankings do not change significantly.
+The same holds true for the rankings with changing pixel sizes.
+For conciseness, we do not include these additional tables in this article.
+In summary, the relative performances within the model families are rather
+  stable in this case study.
diff --git a/tex/4_stu/7_pixels_intervals.tex b/tex/4_stu/7_pixels_intervals.tex
new file mode 100644
index 0000000..8f60041
--- /dev/null
+++ b/tex/4_stu/7_pixels_intervals.tex
@@ -0,0 +1,27 @@
+\subsection{Effects of the Pixel Size and Time Step Length}
+\label{pixels_intervals}
+
+As elaborated in Sub-section \ref{grid}, more order aggregation leads to a
+  higher overall demand level and improved pattern recognition in the
+  generated time series.
+Consequently, individual cases tend to move to the right in tables
+  equivalent to Table \ref{t:results}.
+With the same ADD clusters, forecasts for pixel sizes of $2~\text{km}^2$
+  and $4~\text{km}^2$, time steps of 90 and 120 minutes, or combinations
+  thereof resemble the high-demand results in Tables \ref{t:results},
+  \ref{t:hori}, \ref{t:vert}, and \ref{t:ml}.
+By contrast, forecasts for $0.5~\text{km}^2$ pixels have most of their
+  cases (i.e., $n$) in the no or low demand clusters.
+In that case, the pixels are too small, and pattern recognition becomes
+  harder.
+While it is true that \textit{trivial} exhibits the overall lowest MASE in
+  no-demand cases, such forecasts are effectively worthless for operations.
+In the extreme, with even smaller pixels, we would forecast $0$ orders in
+  all pixels for all time steps.
+In summary, the best model and its accuracy are determined primarily by the
+  ADD; the pixel size and time step length are merely parameters to control
+  it.
+The forecaster's goal is to create a grid with small enough pixels without
+  losing a recognizable pattern.
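+As an illustration of this gridding step, the sketch below snaps order
+  locations onto square pixels and derives each pixel's ADD; the km-offset
+  coordinate columns and the toy data are assumptions for illustration.
+\begin{verbatim}
+import numpy as np
+import pandas as pd
+
+def assign_pixels(orders: pd.DataFrame, area_km2: float) -> pd.DataFrame:
+    # Snap pickup locations (offsets from a reference point, in km)
+    # onto a square grid with pixels of the given area.
+    side_km = np.sqrt(area_km2)
+    orders = orders.copy()
+    orders["pixel_x"] = np.floor(orders["x_km"] / side_km).astype(int)
+    orders["pixel_y"] = np.floor(orders["y_km"] / side_km).astype(int)
+    return orders
+
+# Toy data; the average daily demand decides a pixel's demand cluster.
+rng = np.random.default_rng(1)
+orders = pd.DataFrame({
+    "x_km": rng.uniform(0, 10, size=1_000),
+    "y_km": rng.uniform(0, 10, size=1_000),
+    "day": rng.integers(0, 30, size=1_000),
+})
+pixels = assign_pixels(orders, area_km2=1.0)
+add = pixels.groupby(["pixel_x", "pixel_y"]).size() / pixels["day"].nunique()
+\end{verbatim}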
diff --git a/tex/apx/peak_results.tex b/tex/apx/peak_results.tex
index 58b4dd8..65e5bb6 100644
--- a/tex/apx/peak_results.tex
+++ b/tex/apx/peak_results.tex
@@ -1,14 +1,11 @@
 \section{Forecasting Accuracies during Peak Times}
 \label{peak_results}

-This appendix shows all result tables from the main text with the MASE
-  averages calculated from time steps within peak times.
-Peaks are the times of the day where the typical customer has a lunch or
-  dinner meal and defined to be either from 12 pm to 2 pm or from 6 pm to
-  8 pm.
-While the exact decimals of the MASEs differ from the ones in the main
-  text, the relative ranks of the forecasting methods are the same except in
-  rare cases.
+This appendix shows all result tables from the main text with the MASE
+  averages calculated from time steps within peak times, defined as lunch
+  (12 pm to 2 pm) and dinner (6 pm to 8 pm).
+While the exact decimals of the MASEs differ,
+  the relative ranks of the forecasting methods are the same except in
+  rare cases.

 \begin{center}
 \captionof{table}{Top-3 models by training weeks and average demand
diff --git a/tex/glossary.tex b/tex/glossary.tex
index 2c270f3..ea86d45 100644
--- a/tex/glossary.tex
+++ b/tex/glossary.tex
@@ -1,4 +1,7 @@
 % Abbreviations for technical terms.
+\newglossaryentry{add}{
+    name=ADD, description={Average Daily Demand}
+}
 \newglossaryentry{cart}{
     name=CART, description={Classification and Regression Trees}
 }
diff --git a/tex/preamble.tex b/tex/preamble.tex
index 0a2dffc..207afd3 100644
--- a/tex/preamble.tex
+++ b/tex/preamble.tex
@@ -10,6 +10,9 @@
 % Enable diagonal lines in tables.
 \usepackage{static/slashbox}

+% Enable multiple lines in a table row.
+\usepackage{multirow}
+
 % Make opening quotes look different than closing quotes.
 \usepackage[english=american]{csquotes}
 \MakeOuterQuote{"}
@@ -17,4 +20,5 @@
 % Define helper commands.
 \usepackage{bm}
 \newcommand{\mat}[1]{\bm{#1}}
-\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\ No newline at end of file
+\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
+\newcommand{\thead}[1]{\textbf{#1}}
\ No newline at end of file