How can we predict weekly departmental revenue for Walmart, leveraging our knowledge of Walmart, its business, and some limited historical information?
This is a real problem posed by Walmart
As such, the data is messy to work with. Real data is rarely clean.
\[ WMAE = \frac{1}{\sum w_i} \sum_{i=1}^{n} w_i \left|y_i-\hat{y}_i\right| \]
We’ll have to fix these data issues as we go.
# Setup: load libraries and the four raw Walmart data files ----
library(tidyverse) # We'll extensively use dplyr here
library(lubridate) # Great for simple date functions
library(broom) # To view results in a tidy manner
# NOTE(review): paths are relative to this file's location -- confirm if moved
weekly <- read_csv("../../Data/WMT_train.csv")             # training data (has Weekly_Sales)
weekly.test <- read_csv("../../Data/WMT_test.csv")         # test data (no Weekly_Sales column)
weekly.features <- read_csv("../../Data/WMT_features.csv") # (week, store)-level information
weekly.stores <- read_csv("../../Data/WMT_stores.csv")     # store-level information
`weekly` is our training data.
`weekly.test` is our testing data – it has no `Weekly_Sales` column.
`weekly.features` is general information about (week, store) pairs.
`weekly.stores` is general information about each store.
# (weekly.stores is general information about each store.)

# Merge store/feature information onto a sales data frame and build basic
# time variables.
#
# Relies on `weekly.stores` and `weekly.features` from the enclosing
# environment ("scoping"), joining on their shared columns (natural join).
#
# @param df Data frame with at least Store, Date, and MarkDown1-5 columns.
# @return `df` with store/feature columns merged in, MarkDown1-5 collapsed
#   into a single `markdown` column, and `date`, `week`, `year` added.
preprocess_data <- function(df) {
  # Merge the data together (pulled from outside of function -- "scoping")
  df <- inner_join(df, weekly.stores)
  df <- inner_join(df, weekly.features[, 1:11])
  # Compress the weird markdown information to 1 variable. The original
  # override chain let each later MarkDown overwrite the earlier ones, so
  # the LAST non-missing of MarkDown1..5 wins (0 when all are missing) --
  # coalesce() in reverse order reproduces that exactly.
  df$markdown <- coalesce(df$MarkDown5, df$MarkDown4, df$MarkDown3,
                          df$MarkDown2, df$MarkDown1, 0)
  # Fix dates and add useful time variables
  df$date <- as.Date(df$Date)
  df$week <- week(df$date)
  df$year <- year(df$date)
  # Output the data
  df
}
# Apply identical preprocessing to training and test data
df <- preprocess_data(weekly)
df_test <- preprocess_data(weekly.test)
Merge the data, fix `markdown`, and build the time variables.
Store | date | markdown | MarkDown3 | MarkDown4 | MarkDown5 |
---|---|---|---|---|---|
1 | 2011-10-28 | 0.00 | NA | NA | NA |
1 | 2011-11-04 | 0.00 | NA | NA | NA |
1 | 2011-11-11 | 6551.42 | 215.07 | 2406.62 | 6551.42 |
1 | 2011-11-18 | 5988.57 | 51.98 | 427.39 | 5988.57 |
Apply the (year, Store)’s CPI and Unemployment to missing data
The numeric `id` has the form sswwdd (store, week, department); the submission `Id` string has the form ss_dd_YYYY-MM-DD (store, department, date).
# Unique numeric IDs (sswwdd: store, week, dept) for tracking across data
df$id <- df$Store * 10000 + df$week * 100 + df$Dept
df_test$id <- df_test$Store * 10000 + df_test$week * 100 + df_test$Dept
# String Id (store_dept_date) needed for submission
df_test$Id <- paste0(df_test$Store, "_", df_test$Dept, "_", df_test$date)
# Calculate average by store-dept and distribute to df_test.
# BUG FIX: the original passed `rm.na = T` -- a misspelling of `na.rm` that
# mean() silently swallows via `...`, so NAs were never removed.
df <- df %>%
  group_by(Store, Dept) %>%
  mutate(store_avg = mean(Weekly_Sales, na.rm = TRUE)) %>%
  ungroup()
# One row per (Store, Dept); store_avg is constant within each group, so
# distinct() is equivalent to the original slice(1) + select()
df_sa <- df %>%
  distinct(Store, Dept, store_avg)
df_test <- left_join(df_test, df_sa)
# 36 observations have invalid department codes -- ignore (set to 0)
df_test$store_avg[is.na(df_test$store_avg)] <- 0
# Calculate multipliers based on store_avg (and removing NaN and Inf,
# which arise when store_avg is 0)
df$Weekly_mult <- df$Weekly_Sales / df$store_avg
df$Weekly_mult[!is.finite(df$Weekly_mult)] <- NA
# Calculate mean by week-store-dept and distribute to df_test.
# BUG FIX: `rm.na = T` -> `na.rm = TRUE` (misspelled argument was being
# silently ignored by mean()).
df <- df %>%
  group_by(Store, Dept, week) %>%
  mutate(naive_mean = mean(Weekly_Sales, na.rm = TRUE)) %>%
  ungroup()
# One row per (Store, Dept, week); equivalent to slice(1) + select()
df_wm <- df %>%
  distinct(Store, Dept, week, naive_mean)
df_test <- df_test %>% arrange(Store, Dept, week)
df_test <- left_join(df_test, df_wm)
Some test observations have no matching training data to draw on…
…but we have this:
# Model 1: pooled OLS of the sales multiplier on holiday, markdown, and
# macro covariates. factor(markdown > 0) separates "any markdown at all"
# from the (linear) size of the markdown.
mod1 <- lm(Weekly_mult ~ factor(IsHoliday) + factor(markdown>0) +
markdown + Temperature +
Fuel_Price + CPI + Unemployment,
data=df)
tidy(mod1) # coefficient table
# A tibble: 8 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 1.24 0.0370 33.5 4.10e-245
2 factor(IsHoliday)TRUE 0.0868 0.0124 6.99 2.67e- 12
3 factor(markdown > 0)TRUE 0.0531 0.00885 6.00 2.00e- 9
4 markdown 0.000000741 0.000000875 0.847 3.97e- 1
5 Temperature -0.000763 0.000181 -4.23 2.38e- 5
6 Fuel_Price -0.0706 0.00823 -8.58 9.90e- 18
7 CPI -0.0000837 0.0000887 -0.944 3.45e- 1
8 Unemployment 0.00410 0.00182 2.25 2.45e- 2
glance(mod1) # model-level fit statistics (note the tiny R-squared below)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.000481 0.000464 2.03 29.0 2.96e-40 7 -895684. 1.79e6 1.79e6
# … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Out of sample result: predict the multiplier, scale back to sales levels
df_test$Weekly_mult <- predict(mod1, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
"WMT_linear.csv",
row.names=FALSE)
# track this model's predictions for later comparison
df_test$WS_linear <- df_test$Weekly_Sales
# Check in sample WMAE
# NOTE(review): wmae() is not defined in this excerpt -- presumably defined
# in the accompanying code file; wmae_obs() below is the per-observation
# analogue. Confirm both use the same holiday weighting.
df$WS_linear <- predict(mod1, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear, holidays=df$IsHoliday)
names(w) <- "Linear"
wmaes <- c(w)
wmaes
Linear
3073.57
# Per-observation contribution to the weighted mean absolute error (WMAE):
# holiday weeks carry weight w_i = 5, other weeks w_i = 1, so the
# denominator sum(w_i) equals n + 4 * (number of holiday weeks).
#
# BUG FIX: the original numerator used `holidays * 5 + 1` (weight 6 on
# holidays), inconsistent with both its own denominator and the WMAE
# definition above; the weight is w_i = holidays * 4 + 1.
#
# @param actual Numeric vector of observed weekly sales.
# @param predicted Numeric vector of predictions, same length as `actual`.
# @param holidays Logical (or 0/1) vector flagging holiday weeks.
# @return Numeric vector of per-observation WMAE contributions; summing
#   them yields the overall WMAE.
wmae_obs <- function(actual, predicted, holidays) {
  w <- holidays * 4 + 1
  w * abs(actual - predicted) / sum(w)
}
# Per-observation WMAE contributions, jittered by week to show where the
# linear model struggles (holiday vs non-holiday weeks colored separately)
df$wmaes <- wmae_obs(
  actual = df$Weekly_Sales,
  predicted = df$WS_linear,
  holidays = df$IsHoliday
)
ggplot(df, aes(x = week, y = wmaes, color = factor(IsHoliday))) +
  geom_jitter(width = 0.25) +
  xlab("Week") +
  ylab("WMAE")
# Model 2: model 1 plus week-of-year dummies (factor(week)) to capture
# seasonality
mod2 <- lm(Weekly_mult ~ factor(week) + factor(IsHoliday) + factor(markdown>0) +
markdown + Temperature +
Fuel_Price + CPI + Unemployment,
data=df)
tidy(mod2) # coefficient table
# A tibble: 60 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 1.00 0.0452 22.1 3.11e-108
2 factor(week)2 -0.0648 0.0372 -1.74 8.19e- 2
3 factor(week)3 -0.169 0.0373 -4.54 5.75e- 6
4 factor(week)4 -0.0716 0.0373 -1.92 5.47e- 2
5 factor(week)5 0.0544 0.0372 1.46 1.44e- 1
6 factor(week)6 0.161 0.0361 4.45 8.79e- 6
7 factor(week)7 0.265 0.0345 7.67 1.72e- 14
8 factor(week)8 0.109 0.0340 3.21 1.32e- 3
9 factor(week)9 0.0823 0.0340 2.42 1.55e- 2
10 factor(week)10 0.101 0.0341 2.96 3.04e- 3
# … with 50 more rows
glance(mod2) # model-level fit statistics
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.00501 0.00487 2.02 35.9 0 59 -894728. 1789577. 1.79e6
# … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Out of sample result: predict the multiplier, scale back to sales levels
df_test$Weekly_mult <- predict(mod2, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
"WMT_linear2.csv",
row.names=FALSE)
# track this model's predictions for later comparison
df_test$WS_linear2 <- df_test$Weekly_Sales
# Check in sample WMAE
# NOTE(review): wmae() is defined outside this excerpt -- see the code file
df$WS_linear2 <- predict(mod2, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear2, holidays=df$IsHoliday)
names(w) <- "Linear 2"
wmaes <- c(wmaes, w)
wmaes
Linear Linear 2
3073.570 3230.643
wmaes_out
Linear Linear 2
4993.4 5618.4
…
library(fixest) # fast fixed-effects estimation
# Model 3: fixed-effects (within) regression, absorbing one fixed effect
# per `id`. Since id = Store*10000 + week*100 + Dept, that is one effect
# per (store, week, dept) cell with years pooled, so only across-year
# variation within each cell identifies the coefficients.
mod3 <- feols(Weekly_mult ~ markdown +
Temperature +
Fuel_Price +
CPI +
Unemployment | id, data=df)
tidy(mod3) # coefficient table (fixed effects absorbed, not reported)
# A tibble: 5 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 markdown -0.00000139 0.000000295 -4.73 2.24e- 6
2 Temperature 0.00135 0.000243 5.55 2.83e- 8
3 Fuel_Price -0.0637 0.00905 -7.04 1.92e-12
4 CPI 0.00150 0.000800 1.87 6.13e- 2
5 Unemployment -0.0303 0.00386 -7.85 4.14e-15
glance(mod3) # model-level fit statistics (r.squared includes the FEs)
# A tibble: 1 × 9
r.squared adj.r.squared within.r.squared pseudo.r.squared sigma nobs AIC
<dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 0.823 0.712 0.000540 NA 1.09 421564 1.39e6
# … with 2 more variables: BIC <dbl>, logLik <dbl>
# Out of sample result.
# BUG FIX: the original wrote the csv WITHOUT first predicting from mod3,
# so WMT_FE.csv contained the previous model's Weekly_Sales values.
# Recompute the multiplier and sales from the fixed-effects model first,
# mirroring the two earlier submissions.
df_test$Weekly_mult <- predict(mod3, newdata = df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[, c("Id", "Weekly_Sales")],
          "WMT_FE.csv",
          row.names = FALSE)
# track this model's predictions for later comparison
df_test$WS_FE <- df_test$Weekly_Sales
# Check in sample WMAE (wmae() defined outside this excerpt)
df$WS_FE <- predict(mod3, df) * df$store_avg
w <- wmae(actual = df$Weekly_Sales, predicted = df$WS_FE, holidays = df$IsHoliday)
names(w) <- "FE"
wmaes <- c(wmaes, w)
wmaes
Linear Linear 2 FE
3073.570 3230.643 1552.190
wmaes_out
Linear Linear 2 FE
4993.4 5618.4 3379.3
Code is in the code file – a lot of dplyr
wmaes_out
Linear Linear 2 FE Shifted FE
4993.4 5618.4 3378.8 3274.8