Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
library(tidyverse)
weekly <- read.csv("../../Data/WMT_train.csv")
weekly.test <- read.csv("../../Data/WMT_test.csv")
weekly.features <- read.csv("../../Data/WMT_features.csv")
weekly.stores <- read.csv("../../Data/WMT_stores.csv")
wmae <- function(actual, predicted, holidays) {
sum(abs(actual-predicted)*(holidays*4+1)) / (length(actual) + 4*sum(holidays))
}
library(tidyverse) # we'll extensively use dplyr here
library(lubridate) # Great for simple date functions
library(broom)
weekly <- read.csv("../../Data/WMT_train.csv", stringsAsFactors=FALSE)
weekly.test <- read.csv("../../Data/WMT_test.csv", stringsAsFactors=FALSE)
weekly.features <- read.csv("../../Data/WMT_features.csv", stringsAsFactors=FALSE)
weekly.stores <- read.csv("../../Data/WMT_stores.csv", stringsAsFactors=FALSE)
preprocess_data <- function(df) {
# Merge the data together (Pulled from outside of function -- "scoping")
df <- inner_join(df, weekly.stores)
df <- inner_join(df, weekly.features[,1:11])
# Compress the weird markdown information to 1 variable
df$markdown <- 0
df[!is.na(df$MarkDown1),]$markdown <- df[!is.na(df$MarkDown1),]$MarkDown1
df[!is.na(df$MarkDown2),]$markdown <- df[!is.na(df$MarkDown2),]$MarkDown2
df[!is.na(df$MarkDown3),]$markdown <- df[!is.na(df$MarkDown3),]$MarkDown3
df[!is.na(df$MarkDown4),]$markdown <- df[!is.na(df$MarkDown4),]$MarkDown4
df[!is.na(df$MarkDown5),]$markdown <- df[!is.na(df$MarkDown5),]$MarkDown5
# Fix dates and add useful time variables
df$date <- as.Date(df$Date)
df$week <- week(df$date)
df$year <- year(df$date)
df
}
df <- preprocess_data(weekly)
Joining, by = "Store"
Joining, by = c("Store", "Date")
df_test <- preprocess_data(weekly.test)
Joining, by = "Store"
Joining, by = c("Store", "Date")
df[91:94,] %>%
select(Store, date, markdown, MarkDown3, MarkDown4, MarkDown5) %>%
html_df()
|
Store |
date |
markdown |
MarkDown3 |
MarkDown4 |
MarkDown5 |
91 |
1 |
2011-10-28 |
0.00 |
NA |
NA |
NA |
92 |
1 |
2011-11-04 |
0.00 |
NA |
NA |
NA |
93 |
1 |
2011-11-11 |
6551.42 |
215.07 |
2406.62 |
6551.42 |
94 |
1 |
2011-11-18 |
5988.57 |
51.98 |
427.39 |
5988.57 |
df[1:2,] %>% select(date, week, year) %>% html_df()
date |
week |
year |
2010-02-05 |
6 |
2010 |
2010-02-12 |
7 |
2010 |
# Fill in missing CPI and Unemployment data
df_test <- df_test %>%
group_by(Store, year) %>%
mutate(CPI=ifelse(is.na(CPI), mean(CPI,na.rm=T), CPI),
Unemployment=ifelse(is.na(Unemployment),
mean(Unemployment,na.rm=T),
Unemployment)) %>%
ungroup()
# Unique IDs in the data
df$id <- df$Store *10000 + df$week * 100 + df$Dept
df_test$id <- df_test$Store *10000 + df_test$week * 100 + df_test$Dept
# Unique ID and factor building
swd <- c(df$id, df_test$id) # Pool all IDs
swd <- unique(swd) # Only keep unique elements
swd <- data.frame(id=swd) # Make a data frame
swd$swd <- factor(swd$id) # Extract factors for using later
# Add unique factors to data -- ensures same factors for both data sets
df <- left_join(df,swd)
Joining, by = "id"
df_test <- left_join(df_test,swd)
Joining, by = "id"
df_test$Id <- paste0(df_test$Store,'_',df_test$Dept,"_",df_test$date)
html_df(df_test[c(20000,40000,60000),c("Store","week","Dept","id","swd","Id")])
Store |
week |
Dept |
id |
swd |
Id |
8 |
27 |
33 |
82733 |
82733 |
8_33_2013-07-05 |
15 |
46 |
91 |
154691 |
154691 |
15_91_2012-11-16 |
23 |
52 |
25 |
235225 |
235225 |
23_25_2012-12-28 |
# Calculate average by store-dept and distribute to df_test
df <- df %>%
group_by(Store, Dept) %>%
mutate(store_avg=mean(Weekly_Sales, rm.na=T)) %>%
ungroup()
df_sa <- df %>%
group_by(Store, Dept) %>%
slice(1) %>%
select(Store, Dept, store_avg) %>%
ungroup()
df_test <- left_join(df_test, df_sa)
Joining, by = c("Store", "Dept")
# 36 observations have messed up department codes -- ignore (set to 0)
df_test[is.na(df_test$store_avg),]$store_avg <- 0
# Calculate multipliers based on store_avg (and removing NaN and Inf)
df$Weekly_mult <- df$Weekly_Sales / df$store_avg
df[!is.finite(df$Weekly_mult),]$Weekly_mult <- NA
# Calculate mean by week-store-dept and distribute to df_test
df <- df %>%
group_by(Store, Dept, week) %>%
mutate(naive_mean=mean(Weekly_Sales, rm.na=T)) %>%
ungroup()
df_wm <- df %>%
group_by(Store, Dept, week) %>%
slice(1) %>%
ungroup() %>%
select(Store, Dept, week, naive_mean)
df_test <- df_test %>% arrange(Store, Dept, week)
df_test <- left_join(df_test, df_wm)
Joining, by = c("Store", "Dept", "week")
table(is.na(df_test$naive_mean))
FALSE TRUE
113827 1237
# there are a lot of scattered missing weeks -- fill with a lag first
df_test$naive_na <- is.na(df_test$naive_mean)
df_test <- df_test %>%
arrange(Store, Dept, date) %>%
group_by(Store, Dept) %>%
mutate(naive_mean=ifelse(is.na(naive_mean), lag(naive_mean),naive_mean)) %>%
ungroup()
df_test <- df_test %>%
arrange(Store, Dept, date) %>%
group_by(Store, Dept) %>%
mutate(naive_mean=ifelse(is.na(naive_mean), lag(naive_mean,2),naive_mean)) %>%
ungroup()
df_test <- df_test %>%
arrange(Store, Dept, date) %>%
group_by(Store, Dept) %>%
mutate(naive_mean=ifelse(is.na(naive_mean), lead(naive_mean,1),naive_mean)) %>%
ungroup()
df_test <- df_test %>%
arrange(Store, Dept, date) %>%
group_by(Store, Dept) %>%
mutate(naive_mean=ifelse(is.na(naive_mean), lead(naive_mean,2),naive_mean)) %>%
ungroup()
df_test$naive_mean <- ifelse(is.na(df_test$naive_mean), df_test$store_avg, df_test$naive_mean)
df %>%
group_by(week, Store) %>%
mutate(sales=mean(Weekly_Sales)) %>%
slice(1) %>%
ungroup() %>%
ggplot(aes(y=sales, x=week, color=factor(Store))) +
geom_line() + xlab("Week") + ylab("Sales for Store (dept average)") +
theme(legend.position="none")

mod1 <- lm(Weekly_mult ~ factor(IsHoliday) + factor(markdown>0) +
markdown + Temperature +
Fuel_Price + CPI + Unemployment,
data=df)
tidy(mod1)
glance(mod1)
# Out of sample result
df_test$Weekly_mult <- predict(mod1, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
"WMT_linear.csv",
row.names=FALSE)
# track
df_test$WS_linear <- df_test$Weekly_Sales
# Check in sample WMAE
df$WS_linear <- predict(mod1, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear, holidays=df$IsHoliday)
names(w) <- "Linear"
wmaes <- c(w)
wmaes
Linear
3073.57
wmae_obs <- function(actual, predicted, holidays) {
abs(actual-predicted)*(holidays*5+1) / (length(actual) + 4*sum(holidays))
}
df$wmaes <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear,
holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes, x=week, color=factor(IsHoliday))) +
geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")

mod2 <- lm(Weekly_mult ~ factor(week) + factor(IsHoliday) + factor(markdown>0) +
markdown + Temperature +
Fuel_Price + CPI + Unemployment,
data=df)
tidy(mod2)
glance(mod2)
# Out of sample result
df_test$Weekly_mult <- predict(mod2, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
"WMT_linear2.csv",
row.names=FALSE)
# track
df_test$WS_linear2 <- df_test$Weekly_Sales
# Check in sample WMAE
df$WS_linear2 <- predict(mod2, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear2, holidays=df$IsHoliday)
names(w) <- "Linear 2"
wmaes <- c(wmaes, w)
wmaes
Linear Linear 2
3073.570 3230.643
wmaes_out <- c(4993.4, 5618.4)
names(wmaes_out) <- c("Linear", "Linear 2")
wmaes_out
Linear Linear 2
4993.4 5618.4
df$wmaes <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes,
x=week,
color=factor(IsHoliday))) +
geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")

ggplot(data=df, aes(y=wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
holidays=df$IsHoliday),
x=week,
color=factor(Store))) +
geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE") +
theme(legend.position="none")

ggplot(data=df, aes(y=wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
holidays=df$IsHoliday),
x=week,
color=factor(Dept))) +
geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE") +
theme(legend.position="none")

library(lfe)
Loading required package: Matrix
Attaching package: 㤼㸱Matrix㤼㸲
The following object is masked from 㤼㸱package:tidyr㤼㸲:
expand
mod3 <- felm(Weekly_mult ~ markdown +
Temperature +
Fuel_Price +
CPI +
Unemployment | swd, data=df)
tidy(mod3)
glance(mod3)
predict.felm <- function(object, newdata, use.fe=T, ...) {
# compatible with tibbles
newdata <- as.data.frame(newdata)
co <- coef(object)
y.pred <- t(as.matrix(unname(co))) %*% t(as.matrix(newdata[,names(co)]))
fe.vars <- names(object$fe)
all.fe <- getfe(object)
for (fe.var in fe.vars) {
level <- all.fe[all.fe$fe == fe.var,]
frows <- match(newdata[[fe.var]],level$idx)
myfe <- level$effect[frows]
myfe[is.na(myfe)] = 0
y.pred <- y.pred + myfe
}
as.vector(y.pred)
}
# Out of sample result
df_test$Weekly_mult <- predict(mod3, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg
# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
"WMT_FE.csv",
row.names=FALSE)
# track
df_test$WS_FE <- df_test$Weekly_Sales
# Check in sample WMAE
df$WS_FE <- predict(mod3, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_FE, holidays=df$IsHoliday)
names(w) <- "FE"
wmaes <- c(wmaes, w)
wmaes
Linear Linear 2 FE
3073.570 3230.643 1552.173
wmaes_out <- c(4993.4, 5618.4, 3378.8)
names(wmaes_out) <- c("Linear", "Linear 2", "FE")
wmaes_out
Linear Linear 2 FE
4993.4 5618.4 3378.8
df$wmaes <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_FE,
holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes,
x=week,
color=factor(IsHoliday))) +
geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")

# Function to map holidays
holidf <- function(df, years, weeks) {
if(length(years) == 2) {
years = c(years, 9999)
weeks = c(weeks, 99)
}
df %>%
filter(week(df$date) == weeks[1] & year(df$date) == years[1] |
week(df$date) == weeks[2] & year(df$date) == years[2] |
week(df$date) == weeks[3] & year(df$date) == years[3]) %>%
select(Store,Dept,Weekly_Sales) %>%
group_by(Store,Dept) %>%
mutate(holiday_sales=mean(Weekly_Sales,rm.na=T)) %>%
slice(1) %>%
ungroup() %>%
select(Store,Dept,holiday_sales) %>%
as.data.frame()
}
# Add holiday shift to data
add_holiday <- function(df, holiday, wk) {
q <- left_join(df, holiday)
q <- q %>% transmute(Weekly_Sales=ifelse(week==wk,holiday_sales,Weekly_Sales))
q[[1]]
}
# Start with our FE approach
df_test$Weekly_Sales <- df_test$WS_FE
# CHRISTMAS
# 2010: 53, 2011: 52, 2012: *52*
# Get a df with Christmas specific sales
s_xmas <- holidf(df, c(2010,2011), c(53,52))
s_xmas_m1 <- holidf(df, c(2010,2011), c(52,51))
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas, 52)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas_m1, 51)
Joining, by = c("Store", "Dept")
# BLACK FRIDAY 2010-11-26: 48; 2011-11-25: 47, 2012-11-23: *47*
s_bf <- holidf(df, c(2010,2011), c(48,47))
df_test$Weekly_Sales <- add_holiday(df_test, s_bf, 47)
Joining, by = c("Store", "Dept")
# Note that the superbowl is weeks 6, 6, 7, *6*
s_sb <- holidf(df, c(2010,2011,2012), c(6,6,7))
s_sb_m1 <- holidf(df, c(2010,2011,2012), c(5,5,6))
df_test$Weekly_Sales <- add_holiday(df_test, s_sb, 7)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_sb_m1, 6)
Joining, by = c("Store", "Dept")
# Clean up any missing values added
df_test[is.na(df_test$Weekly_Sales),]$Weekly_Sales <- df_test[is.na(df_test$Weekly_Sales),]$naive_mean
# Calculate yearly growt
# A bit difficult since the data is a partial year, a full year, and a partial year
yg1 <- df %>%
arrange(Store, year) %>%
filter(year == 2010) %>%
group_by(Store, year) %>%
mutate(sales2010=mean(Weekly_Sales)) %>%
slice(1) %>%
ungroup() %>%
select(Store, sales2010)
yg2 <- df %>%
arrange(Store, year) %>%
filter(date > as.Date("2011-02-05") & year == 2011) %>%
group_by(Store, year) %>%
mutate(sales2011a=mean(Weekly_Sales)) %>%
slice(1) %>%
ungroup() %>%
select(Store, sales2011a)
yg3 <- df %>%
arrange(Store, year) %>%
filter(date < as.Date("2011-10-06") & year == 2011) %>%
group_by(Store, year) %>%
mutate(sales2011b=mean(Weekly_Sales)) %>%
slice(1) %>%
ungroup() %>%
select(Store, sales2011b)
yg4 <- df %>%
arrange(Store, year) %>%
filter(year == 2012) %>%
group_by(Store, year) %>%
mutate(sales2012=mean(Weekly_Sales)) %>%
slice(1) %>%
ungroup() %>%
select(Store, sales2012)
yg <- full_join(yg1,yg2) %>% full_join(yg3) %>% full_join(yg4)
Joining, by = "Store"
Joining, by = "Store"
Joining, by = "Store"
yg <- yg %>%
mutate(growth1=sales2011a/sales2010,
growth2=sales2012/sales2011b,
growth=ifelse(is.na(growth1),ifelse(is.na(growth2),1,growth2),ifelse(is.na(growth2),1,(growth1+growth2)/2))) %>%
select(Store, growth)
# Add growth data to our df_test data frame
df_test <- left_join(df_test, yg)
Joining, by = "Store"
df_test[df_test$year == 2012,]$Weekly_Sales <- df_test[df_test$year == 2012,]$Weekly_Sales * df_test[df_test$year == 2012,]$growth
df_test[df_test$year == 2013,]$Weekly_Sales <- df_test[df_test$year == 2013,]$Weekly_Sales * df_test[df_test$year == 2013,]$growth
write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_FE_shift.csv", row.names=FALSE)
df_test$WS_FE_shift <- df_test$Weekly_Sales
# BONUS: Ensembling
# Ensemble: Building multiple models and combining them into 1
# This example: add in a naive mean approach with shifting + growth
df_test$Weekly_Sales <- df_test$naive_mean
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas, 52)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas_m1, 51)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_bf, 47)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_sb, 7)
Joining, by = c("Store", "Dept")
df_test$Weekly_Sales <- add_holiday(df_test, s_sb_m1, 6)
Joining, by = c("Store", "Dept")
df_test[is.na(df_test$Weekly_Sales),]$Weekly_Sales <- df_test[is.na(df_test$Weekly_Sales),]$naive_mean
df_test[df_test$year == 2012,]$Weekly_Sales <- df_test[df_test$year == 2012,]$Weekly_Sales * df_test[df_test$year == 2012,]$growth
df_test[df_test$year == 2013,]$Weekly_Sales <- df_test[df_test$year == 2013,]$Weekly_Sales * df_test[df_test$year == 2013,]$growth
write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_naivemean.csv", row.names=FALSE)
df_test$WS_naivemean <- df_test$Weekly_Sales
# Ensembles -- in this case, these don't work so well though. Better with more models
df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,min)
#df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,max)
#df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,mean)
write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_ens.csv", row.names=FALSE)
wmaes_out <- c(4993.4, 5618.4, 3378.8, 3274.2)
names(wmaes_out) <- c("Linear", "Linear 2", "FE", "Shifted FE")
wmaes_out
Linear Linear 2 FE Shifted FE
4993.4 5618.4 3378.8 3274.2
---
title: "Code for Session Project"
author: "Dr. Richard M. Crowley"
date: ""
output:
  html_notebook
---

Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.

```{r helpers, warning=FALSE}
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
  if(!length(cols)) {
    cols=colnames(text)
  }
  if(!col1) {
    kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
      kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
  } else {
    kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
      kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
      column_spec(1,bold=T)
  }
}
```

```{r}
library(tidyverse)
weekly <- read.csv("../../Data/WMT_train.csv")
weekly.test <- read.csv("../../Data/WMT_test.csv")
weekly.features <- read.csv("../../Data/WMT_features.csv")
weekly.stores <- read.csv("../../Data/WMT_stores.csv")
```

```{r}
wmae <- function(actual, predicted, holidays) {
  sum(abs(actual-predicted)*(holidays*4+1)) / (length(actual) + 4*sum(holidays))
}
```

```{r, message=F}
library(tidyverse)  # we'll extensively use dplyr here
library(lubridate)  # Great for simple date functions
library(broom)
weekly <- read.csv("../../Data/WMT_train.csv", stringsAsFactors=FALSE)
weekly.test <- read.csv("../../Data/WMT_test.csv", stringsAsFactors=FALSE)
weekly.features <- read.csv("../../Data/WMT_features.csv", stringsAsFactors=FALSE)
weekly.stores <- read.csv("../../Data/WMT_stores.csv", stringsAsFactors=FALSE)
```

```{r, message=F}
preprocess_data <- function(df) {
  # Merge the data together (Pulled from outside of function -- "scoping")
  df <- inner_join(df, weekly.stores)
  df <- inner_join(df, weekly.features[,1:11])
  
  # Compress the weird markdown information to 1 variable
  df$markdown <- 0
  df[!is.na(df$MarkDown1),]$markdown <- df[!is.na(df$MarkDown1),]$MarkDown1
  df[!is.na(df$MarkDown2),]$markdown <- df[!is.na(df$MarkDown2),]$MarkDown2
  df[!is.na(df$MarkDown3),]$markdown <- df[!is.na(df$MarkDown3),]$MarkDown3
  df[!is.na(df$MarkDown4),]$markdown <- df[!is.na(df$MarkDown4),]$MarkDown4
  df[!is.na(df$MarkDown5),]$markdown <- df[!is.na(df$MarkDown5),]$MarkDown5
  
  # Fix dates and add useful time variables
  df$date <- as.Date(df$Date)
  df$week <- week(df$date)
  df$year <- year(df$date)
  
  df
}
```

```{r, message=F}
df <- preprocess_data(weekly)
df_test <- preprocess_data(weekly.test)
```

```{r}
df[91:94,] %>%
  select(Store, date, markdown, MarkDown3, MarkDown4, MarkDown5) %>%
  html_df()
```

```{r}
df[1:2,] %>% select(date, week, year) %>% html_df()
```

```{r}
# Fill in missing CPI and Unemployment data
df_test <- df_test %>%
  group_by(Store, year) %>%
  mutate(CPI=ifelse(is.na(CPI), mean(CPI,na.rm=T), CPI),
         Unemployment=ifelse(is.na(Unemployment),
                             mean(Unemployment,na.rm=T),
                             Unemployment)) %>%
  ungroup()
```

```{r, message=F}
# Unique IDs in the data
df$id <- df$Store *10000 + df$week * 100 + df$Dept
df_test$id <- df_test$Store *10000 + df_test$week * 100 + df_test$Dept

# Unique ID and factor building
swd <- c(df$id, df_test$id)  # Pool all IDs
swd <- unique(swd)  # Only keep unique elements
swd <- data.frame(id=swd)  # Make a data frame
swd$swd <- factor(swd$id)  # Extract factors for using later

# Add unique factors to data -- ensures same factors for both data sets
df <- left_join(df,swd)
df_test <- left_join(df_test,swd)
```

```{r}
df_test$Id <- paste0(df_test$Store,'_',df_test$Dept,"_",df_test$date)
```

```{r}
html_df(df_test[c(20000,40000,60000),c("Store","week","Dept","id","swd","Id")])
```

```{r}
# Calculate average by store-dept and distribute to df_test
df <- df %>%
  group_by(Store, Dept) %>% 
  mutate(store_avg=mean(Weekly_Sales, rm.na=T)) %>%
  ungroup()
df_sa <- df %>%
  group_by(Store, Dept) %>%
  slice(1) %>%
  select(Store, Dept, store_avg) %>%
  ungroup()
df_test <- left_join(df_test, df_sa)
# 36 observations have messed up department codes -- ignore (set to 0)
df_test[is.na(df_test$store_avg),]$store_avg  <- 0

# Calculate multipliers based on store_avg (and removing NaN and Inf)
df$Weekly_mult <- df$Weekly_Sales / df$store_avg
df[!is.finite(df$Weekly_mult),]$Weekly_mult <- NA
```

```{r}
# Calculate mean by week-store-dept and distribute to df_test
df <- df %>%
  group_by(Store, Dept, week) %>%
  mutate(naive_mean=mean(Weekly_Sales, rm.na=T)) %>%
  ungroup()
df_wm <- df %>%
  group_by(Store, Dept, week) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store, Dept, week, naive_mean)
df_test <- df_test %>% arrange(Store, Dept, week)
df_test <- left_join(df_test, df_wm)
```

```{r}
table(is.na(df_test$naive_mean))
```

```{r, message=F, warning=F}
# there are a lot of scattered missing weeks -- fill with a lag first
df_test$naive_na <- is.na(df_test$naive_mean)
df_test <- df_test %>%
  arrange(Store, Dept, date) %>%
  group_by(Store, Dept) %>%
  mutate(naive_mean=ifelse(is.na(naive_mean), lag(naive_mean),naive_mean)) %>%
  ungroup()
df_test <- df_test %>%
  arrange(Store, Dept, date) %>%
  group_by(Store, Dept) %>%
  mutate(naive_mean=ifelse(is.na(naive_mean), lag(naive_mean,2),naive_mean)) %>%
  ungroup()
df_test <- df_test %>%
  arrange(Store, Dept, date) %>%
  group_by(Store, Dept) %>%
  mutate(naive_mean=ifelse(is.na(naive_mean), lead(naive_mean,1),naive_mean)) %>%
  ungroup()
df_test <- df_test %>%
  arrange(Store, Dept, date) %>%
  group_by(Store, Dept) %>%
  mutate(naive_mean=ifelse(is.na(naive_mean), lead(naive_mean,2),naive_mean)) %>%
  ungroup()
df_test$naive_mean <- ifelse(is.na(df_test$naive_mean), df_test$store_avg, df_test$naive_mean)
```

```{r, fig.height=3}
df %>% 
  group_by(week, Store) %>%
  mutate(sales=mean(Weekly_Sales)) %>%
  slice(1) %>%
  ungroup() %>%
  ggplot(aes(y=sales, x=week, color=factor(Store))) +
  geom_line() + xlab("Week") + ylab("Sales for Store (dept average)") + 
  theme(legend.position="none")
```

```{r}
mod1 <- lm(Weekly_mult ~ factor(IsHoliday) + factor(markdown>0) +
                         markdown + Temperature +
                         Fuel_Price + CPI + Unemployment,
           data=df)
tidy(mod1)
glance(mod1)
```

```{r}
# Out of sample result
df_test$Weekly_mult <- predict(mod1, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg

# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
          "WMT_linear.csv",
          row.names=FALSE)

# track
df_test$WS_linear <- df_test$Weekly_Sales

# Check in sample WMAE
df$WS_linear <- predict(mod1, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear, holidays=df$IsHoliday)
names(w) <- "Linear"
wmaes <- c(w)
wmaes
```

```{r fig.height=4}
wmae_obs <- function(actual, predicted, holidays) {
  abs(actual-predicted)*(holidays*5+1) / (length(actual) + 4*sum(holidays))
}
df$wmaes  <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear,
                      holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes, x=week, color=factor(IsHoliday))) + 
  geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")
```

```{r}
mod2 <- lm(Weekly_mult ~ factor(week) + factor(IsHoliday) + factor(markdown>0) +
                         markdown + Temperature +
                         Fuel_Price + CPI + Unemployment,
           data=df)
tidy(mod2)
glance(mod2)
```

```{r, message=F}
# Out of sample result
df_test$Weekly_mult <- predict(mod2, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg

# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
          "WMT_linear2.csv",
          row.names=FALSE)

# track
df_test$WS_linear2 <- df_test$Weekly_Sales

# Check in sample WMAE
df$WS_linear2 <- predict(mod2, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_linear2, holidays=df$IsHoliday)
names(w) <- "Linear 2"
wmaes <- c(wmaes, w)
wmaes
```

```{r}
wmaes_out <- c(4993.4, 5618.4)
names(wmaes_out) <- c("Linear", "Linear 2")
```

```{r}
wmaes_out
```

```{r, fig.height=4}
df$wmaes  <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
                      holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes,
                    x=week,
                    color=factor(IsHoliday))) + 
  geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")
```

```{r, fig.height=4}
ggplot(data=df, aes(y=wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
                               holidays=df$IsHoliday),
                    x=week,
                    color=factor(Store))) + 
  geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE") + 
  theme(legend.position="none")
```

```{r, fig.height=4}
ggplot(data=df, aes(y=wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_linear2,
                               holidays=df$IsHoliday),
                    x=week,
                    color=factor(Dept))) + 
  geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE") + 
  theme(legend.position="none")
```

```{r, message=F}
library(lfe)
mod3 <- felm(Weekly_mult ~ markdown +
            Temperature +
            Fuel_Price +
            CPI +
            Unemployment | swd, data=df)
tidy(mod3)
glance(mod3)
```

```{r}
predict.felm <- function(object, newdata, use.fe=T, ...) {
  # compatible with tibbles
  newdata <- as.data.frame(newdata)
  co <- coef(object)
  
  y.pred <- t(as.matrix(unname(co))) %*% t(as.matrix(newdata[,names(co)]))
  
  fe.vars <- names(object$fe)
  
  all.fe <- getfe(object)
  for (fe.var in fe.vars) {
    level <- all.fe[all.fe$fe == fe.var,]
    frows <- match(newdata[[fe.var]],level$idx)
    myfe <- level$effect[frows]
    myfe[is.na(myfe)] = 0
      
    y.pred <- y.pred + myfe
  }
  as.vector(y.pred)
}
```

```{r, message=F}
# Out of sample result
df_test$Weekly_mult <- predict(mod3, df_test)
df_test$Weekly_Sales <- df_test$Weekly_mult * df_test$store_avg

# Required to submit a csv of Id and Weekly_Sales
write.csv(df_test[,c("Id","Weekly_Sales")],
          "WMT_FE.csv",
          row.names=FALSE)

# track
df_test$WS_FE <- df_test$Weekly_Sales

# Check in sample WMAE
df$WS_FE <- predict(mod3, df) * df$store_avg
w <- wmae(actual=df$Weekly_Sales, predicted=df$WS_FE, holidays=df$IsHoliday)
names(w) <- "FE"
wmaes <- c(wmaes, w)
wmaes
```

```{r}
wmaes_out <- c(4993.4, 5618.4, 3378.8)
names(wmaes_out) <- c("Linear", "Linear 2", "FE")
```

```{r}
wmaes_out
```

```{r, fig.height=4}
df$wmaes  <- wmae_obs(actual=df$Weekly_Sales, predicted=df$WS_FE,
                      holidays=df$IsHoliday)
ggplot(data=df, aes(y=wmaes,
                    x=week,
                    color=factor(IsHoliday))) + 
  geom_jitter(width=0.25) + xlab("Week") + ylab("WMAE")
```

```{r}
# Function to map holidays
holidf <- function(df, years, weeks) {
  if(length(years) == 2) {
    years = c(years, 9999)
    weeks = c(weeks, 99)
  }
  df %>%
  filter(week(df$date) == weeks[1] & year(df$date) == years[1] |
         week(df$date) == weeks[2] & year(df$date) == years[2] |
         week(df$date) == weeks[3] & year(df$date) == years[3]) %>%
  select(Store,Dept,Weekly_Sales) %>%
  group_by(Store,Dept) %>%
  mutate(holiday_sales=mean(Weekly_Sales,rm.na=T)) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store,Dept,holiday_sales) %>%
  as.data.frame()
}
# Add holiday shift to data
add_holiday <- function(df, holiday, wk) {
  q <- left_join(df, holiday)
  q <- q %>% transmute(Weekly_Sales=ifelse(week==wk,holiday_sales,Weekly_Sales))
  q[[1]]
}

# Start with our FE approach
df_test$Weekly_Sales <- df_test$WS_FE

# CHRISTMAS
# 2010: 53, 2011: 52, 2012: *52*
# Get a df with Christmas specific sales
s_xmas <- holidf(df, c(2010,2011), c(53,52))
s_xmas_m1 <- holidf(df, c(2010,2011), c(52,51))

df_test$Weekly_Sales <- add_holiday(df_test, s_xmas, 52)
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas_m1, 51)

# BLACK FRIDAY 2010-11-26: 48; 2011-11-25: 47, 2012-11-23: *47*
s_bf <- holidf(df, c(2010,2011), c(48,47))

df_test$Weekly_Sales <- add_holiday(df_test, s_bf, 47)

# Note that the superbowl is weeks 6, 6, 7, *6*
s_sb <- holidf(df, c(2010,2011,2012), c(6,6,7))
s_sb_m1 <- holidf(df, c(2010,2011,2012), c(5,5,6))

df_test$Weekly_Sales <- add_holiday(df_test, s_sb, 7)
df_test$Weekly_Sales <- add_holiday(df_test, s_sb_m1, 6)

# Clean up any missing values added
df_test[is.na(df_test$Weekly_Sales),]$Weekly_Sales <- df_test[is.na(df_test$Weekly_Sales),]$naive_mean


# Calculate yearly growt
# A bit difficult since the data is a partial year, a full year, and a partial year
yg1 <- df %>%
  arrange(Store, year) %>%
  filter(year == 2010) %>%
  group_by(Store, year) %>%
  mutate(sales2010=mean(Weekly_Sales)) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store, sales2010)
yg2 <- df %>%
  arrange(Store, year) %>%
  filter(date > as.Date("2011-02-05") & year == 2011) %>%
  group_by(Store, year) %>%
  mutate(sales2011a=mean(Weekly_Sales)) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store, sales2011a)
yg3 <- df %>%
  arrange(Store, year) %>%
  filter(date < as.Date("2011-10-06") & year == 2011) %>%
  group_by(Store, year) %>%
  mutate(sales2011b=mean(Weekly_Sales)) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store, sales2011b)
yg4 <- df %>%
  arrange(Store, year) %>%
  filter(year == 2012) %>%
  group_by(Store, year) %>%
  mutate(sales2012=mean(Weekly_Sales)) %>%
  slice(1) %>%
  ungroup() %>%
  select(Store, sales2012)
yg <- full_join(yg1,yg2) %>% full_join(yg3) %>% full_join(yg4)
yg <- yg %>%
  mutate(growth1=sales2011a/sales2010,
         growth2=sales2012/sales2011b,
         growth=ifelse(is.na(growth1),ifelse(is.na(growth2),1,growth2),ifelse(is.na(growth2),1,(growth1+growth2)/2))) %>%
  select(Store, growth)

# Add growth data to our df_test data frame
df_test <- left_join(df_test, yg)
df_test[df_test$year == 2012,]$Weekly_Sales <- df_test[df_test$year == 2012,]$Weekly_Sales * df_test[df_test$year == 2012,]$growth
df_test[df_test$year == 2013,]$Weekly_Sales <- df_test[df_test$year == 2013,]$Weekly_Sales * df_test[df_test$year == 2013,]$growth

write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_FE_shift.csv", row.names=FALSE)
df_test$WS_FE_shift <- df_test$Weekly_Sales

# BONUS: Ensembling
# Ensemble: Building multiple models and combining them into 1

# This example: add in a naive mean approach with shifting + growth
df_test$Weekly_Sales <- df_test$naive_mean
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas, 52)
df_test$Weekly_Sales <- add_holiday(df_test, s_xmas_m1, 51)
df_test$Weekly_Sales <- add_holiday(df_test, s_bf, 47)
df_test$Weekly_Sales <- add_holiday(df_test, s_sb, 7)
df_test$Weekly_Sales <- add_holiday(df_test, s_sb_m1, 6)
df_test[is.na(df_test$Weekly_Sales),]$Weekly_Sales <- df_test[is.na(df_test$Weekly_Sales),]$naive_mean
df_test[df_test$year == 2012,]$Weekly_Sales <- df_test[df_test$year == 2012,]$Weekly_Sales * df_test[df_test$year == 2012,]$growth
df_test[df_test$year == 2013,]$Weekly_Sales <- df_test[df_test$year == 2013,]$Weekly_Sales * df_test[df_test$year == 2013,]$growth
write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_naivemean.csv", row.names=FALSE)
df_test$WS_naivemean <- df_test$Weekly_Sales

# Ensembles -- in this case, these don't work so well though.  Better with more models
df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,min)
#df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,max)
#df_test$Weekly_Sales <- apply(df_test[,c("WS_FE_shift","WS_naivemean")],1,mean)

write.csv(df_test[,c("Id","Weekly_Sales")], file="WMT_ens.csv", row.names=FALSE)

```

```{r}
wmaes_out <- c(4993.4, 5618.4, 3378.8, 3274.2)
names(wmaes_out) <- c("Linear", "Linear 2", "FE", "Shifted FE")
```

```{r}
wmaes_out
```

