Code for Session 4

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer; update these paths before running the code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first column. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight, indexed from the end: 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the header row should stay fixed when scrolling. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
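For reference, a minimal call might look like the following (a built-in dataset is used purely for illustration):

# Illustrative usage only -- iris stands in for any data frame
html_df(head(iris[, 1:4]),
        names = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width"),
        highlight_rows = c(1))  # bold and shade the first data row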

library(tidyverse)
library(broom)
# Import Walmart data from Session_Project
library(lubridate)
preprocess_data <- function(df) {
  # Merge in the store and feature data (weekly.stores and weekly.features are
  # pulled from outside of the function via R's lexical scoping)
  df <- df %>%
    inner_join(weekly.stores) %>%
    inner_join(weekly.features[,1:11])
  
  # Compress the five weird MarkDown columns into 1 variable
  df <- df %>%
    mutate(markdown = 0,
           markdown = ifelse(!is.na(MarkDown1),MarkDown1,markdown),
           markdown = ifelse(!is.na(MarkDown2),MarkDown2,markdown),
           markdown = ifelse(!is.na(MarkDown3),MarkDown3,markdown),
           markdown = ifelse(!is.na(MarkDown4),MarkDown4,markdown),
           markdown = ifelse(!is.na(MarkDown5),MarkDown5,markdown))
  
  # Fix dates and add useful time variables
  df <- df %>%
    mutate(date = as.Date(Date),
           week = week(date),
           year = year(date),
           month = month(date))

  df
}

weekly <- read_csv("../../Data/WMT_train.csv")
weekly.features <- read_csv("../../Data/WMT_features.csv")
weekly.stores <- read_csv("../../Data/WMT_stores.csv")
df <- preprocess_data(weekly)

df <- df %>%
  group_by(Store, Dept) %>% 
  mutate(store_avg=mean(Weekly_Sales, na.rm=T)) %>%
  ungroup()
# Create the binary variable from Walmart sales data
df <- df %>% mutate(double = ifelse(Weekly_Sales > store_avg*2,1,0))
fit <- glm(double ~ IsHoliday, data=df, family=binomial)
tidy(fit)
# A tibble: 2 × 5
  term          estimate std.error statistic  p.value
  <chr>            <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     -3.45    0.00924    -373.  0       
2 IsHolidayTRUE    0.539   0.0278       19.4 1.09e-83
# Odds ratios for each coefficient (instead of exponentiating by hand):
exp(coef(fit))
  (Intercept) IsHolidayTRUE 
   0.03184725    1.71367497 
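The exponentiated coefficients can also be read as percentage changes in the odds; a quick sketch of that conversion:

# (exp(coef) - 1) * 100 gives the percent change in odds for a one-unit change;
# e.g., ~71% higher odds that weekly sales exceed double the department average in holiday weeks
(exp(coef(fit)) - 1) * 100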
test_data <- data.frame(IsHoliday = c(F,T))
predict(fit, test_data)  # log odds
        1         2 
-3.446804 -2.908164 
# probabilities
predict(fit, test_data, type="response")
         1          2 
0.03086431 0.05175146 
a <- exp(-3.446804)
b <- exp(-2.908164)
c(a/(1+a), b/(1+b))
[1] 0.03086431 0.05175146
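The manual conversion above is just the logistic function applied to the log odds; a minimal sketch of a reusable helper (base R's plogis() is equivalent):

# Sketch: convert log odds to probabilities via the logistic function
logit_to_prob <- function(log_odds) {
  exp(log_odds) / (1 + exp(log_odds))
}
logit_to_prob(c(-3.446804, -2.908164))  # same probabilities as above
plogis(c(-3.446804, -2.908164))         # built-in equivalent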
fit2 <- glm(double ~ IsHoliday + Temperature + Fuel_Price, data=df, family=binomial)
summary(fit2)

Call:
glm(formula = double ~ IsHoliday + Temperature + Fuel_Price, 
    family = binomial, data = df)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -1.7764917  0.0673246  -26.39   <2e-16 ***
IsHolidayTRUE  0.3704298  0.0284395   13.03   <2e-16 ***
Temperature   -0.0108268  0.0004698  -23.04   <2e-16 ***
Fuel_Price    -0.3091950  0.0196234  -15.76   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 120370  on 421569  degrees of freedom
Residual deviance: 119146  on 421566  degrees of freedom
AIC: 119154

Number of Fisher Scoring iterations: 6
# Odds
exp(coef(fit2))
  (Intercept) IsHolidayTRUE   Temperature    Fuel_Price 
    0.1692308     1.4483570     0.9892316     0.7340376 
# Typical September days
hday_sep    <- mean(predict(fit2, filter(df, IsHoliday, month==9), type="response"))
no_hday_sep <-  mean(predict(fit2, filter(df, !IsHoliday, month==9), type="response"))
# Typical December days
hday_dec    <- mean(predict(fit2, filter(df, IsHoliday, month==12), type="response"))
no_hday_dec <-  mean(predict(fit2, filter(df, !IsHoliday, month==12), type="response"))

html_df(data.frame(Month=c(9,9,12,12), IsHoliday=c(FALSE,TRUE,FALSE,TRUE),
                   Probability=c(no_hday_sep, hday_sep, no_hday_dec, hday_dec)))
 Month  IsHoliday  Probability
     9      FALSE    0.0266789
     9       TRUE    0.0374761
    12      FALSE    0.0398377
    12       TRUE    0.0586483
percent <- function(x) {
  paste0(round(x*100,2),"%")
}
library(rlang)
# Another use for a quosure :)
# You could always just use quoted arguments here though
extract_margin <- function(m, x) {
  x <- enquo(x)
  percent(filter(summary(m), factor==quo_text(x))$AME)
}
# Calculate AME marginal effects
library(margins)
m <- margins(fit2)
m
 Temperature Fuel_Price IsHoliday
  -0.0003377  -0.009644   0.01334
summary(m) %>%
  html_df()
      factor         AME         SE          z  p       lower       upper
  Fuel_Price  -0.0096438  0.0006163  -15.64800  0  -0.0108517  -0.0084359
   IsHoliday   0.0133450  0.0011754   11.35372  0   0.0110413   0.0156487
 Temperature  -0.0003377  0.0000149  -22.71256  0  -0.0003668  -0.0003085
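With the margins object m in hand, the extract_margin() helper defined above can pull a single effect out as a formatted percentage, for example:

# Illustrative use of the quosure-based helper defined earlier
extract_margin(m, IsHoliday)    # roughly "1.33%" given the AME shown above
extract_margin(m, Temperature)  # roughly "-0.03%"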
plot(m, which=summary(m)$factor)

margins(fit2, at = list(IsHoliday = c(TRUE, FALSE)),
        variables = c("Temperature", "Fuel_Price")) %>%
  summary() %>%
  html_df()
      factor  IsHoliday         AME         SE          z  p       lower       upper
  Fuel_Price      FALSE  -0.0093401  0.0005989  -15.59617  0  -0.0105139  -0.0081664
  Fuel_Price       TRUE  -0.0131335  0.0008717  -15.06650  0  -0.0148420  -0.0114250
 Temperature      FALSE  -0.0003271  0.0000146  -22.46024  0  -0.0003556  -0.0002985
 Temperature       TRUE  -0.0004599  0.0000210  -21.92927  0  -0.0005010  -0.0004188
margins(fit2, at = list(Temperature = c(0, 25, 50, 75, 100)),
        variables = c("IsHoliday")) %>%
  summary() %>%
  html_df()
    factor  Temperature        AME         SE         z  p      lower      upper
 IsHoliday            0  0.0234484  0.0020168  11.62643  0  0.0194955  0.0274012
 IsHoliday           25  0.0184956  0.0015949  11.59704  0  0.0153697  0.0216214
 IsHoliday           50  0.0144798  0.0012679  11.42060  0  0.0119948  0.0169648
 IsHoliday           75  0.0112693  0.0010161  11.09035  0  0.0092777  0.0132609
 IsHoliday          100  0.0087305  0.0008213  10.62977  0  0.0071207  0.0103402
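The same at() output can also be plotted directly; a sketch (using ggplot2, loaded with the tidyverse) of how the holiday effect shrinks as temperature rises:

# Sketch: plot the AME of IsHoliday across the specified temperatures
margins(fit2, at = list(Temperature = c(0, 25, 50, 75, 100)),
        variables = c("IsHoliday")) %>%
  summary() %>%
  ggplot(aes(x = Temperature, y = AME)) +
  geom_line() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 2) +
  labs(y = "AME of IsHoliday")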
# load data
df <- read_csv("../../Data/Shipping/shipping_dataset.csv", na="NA")
Rows: 71460 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): left_type, arrive_type, status
dbl (11): imo, run, lat, lon, last_update, left_time, left_port, arrive_time...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_ships <- read_csv("../../Data/Shipping/index.csv", na="NA")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 2512 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): flag, shipname, url, photo, photos link, recognized_next_port, rep...
dbl  (4): imo, lat_of_latest_position, lon_of_latest_position, year_of_build

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_ports <- read_csv("../../Data/Shipping/port_dataset.csv", na="NA")
Rows: 1117 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (3): port, lat, lon

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
typhoon <- read_csv("../../Data/Shipping/typhoon.csv")
Rows: 33419 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): date, time, constrnt, wkng, rpd, scene, est_rmw, mthd, sat, commen...
dbl (13): intensity_ci, intensity_mslp, intensity_vmax, tno_fnl, tno_adj, tn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rename lat and lon to avoid conflicts
df_ports <- df_ports %>% rename(port_lat=lat, port_lon=lon)

# Join in ships and ports
df <- df %>% 
  left_join(df_ships) %>%
  filter(imo != 9528029 | reported_destination == "AU ADL") %>%
  left_join(df_ports, by=c("left_port"="port")) %>%
  rename(left_lat=port_lat, left_lon=port_lon) %>%
  left_join(df_ports, by=c("arrive_port"="port")) %>%
  rename(arrive_lat=port_lat, arrive_lon=port_lon)
Joining with `by = join_by(imo)`
Warning in left_join(., df_ships): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 847 of `x` matches multiple rows in `y`.
ℹ Row 84 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
# fix dates
df <- df %>% 
  mutate(last_update = as.POSIXct(last_update, tz="UTC",origin="1970-01-01"),
         left_time = as.POSIXct(left_time, tz="UTC",origin="1970-01-01"),
         arrive_time = as.POSIXct(arrive_time, tz="UTC",origin="1970-01-01"))
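These timestamp columns come in as numeric seconds since 1970-01-01 UTC (the Unix epoch), which is why origin is supplied; for example:

# Unix seconds map back to a date-time once an origin is given
as.POSIXct(1535673600, tz="UTC", origin="1970-01-01")  # "2018-08-31 UTC"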

# Fix typhoon dates
typhoon$date <- as.POSIXct(paste(typhoon$date,typhoon$time), format="%Y%b%d %H%M%S", tz="UTC", origin="1970-01-01")
# Fix typhoon longitude -- the data stores longitude multiplied by -1, so flip the sign back
typhoon$lon <- typhoon$lon * -1

typhoon_all <- typhoon
# filter to dates in sample
typhoon <- typhoon %>%
  filter(date > as.POSIXct("2018-08-30", tz="UTC"))

df <- df %>% rename(frame=run)

df_all <- df[df$frame != 32,]
df <- df %>% filter(last_update > as.POSIXct("2018-08-30", tz="UTC"))

library(plotly)  # needed for the toRGB() function

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
geo <- list(
  showland = TRUE,
  showlakes = TRUE,
  showcountries = TRUE,
  showocean = TRUE,
  countrywidth = 0.5,
  landcolor = toRGB("grey90"),
  lakecolor = toRGB("aliceblue"),
  oceancolor = toRGB("aliceblue"),
  projection = list(
    type = 'orthographic',  # detailed at https://plot.ly/r/reference/#layout-geo-projection
    rotation = list(
      lon = 100,
      lat = 1,
      roll = 0
    )
  ),
  lonaxis = list(
    showgrid = TRUE,
    gridcolor = toRGB("gray40"),
    gridwidth = 0.5
  ),
  lataxis = list(
    showgrid = TRUE,
    gridcolor = toRGB("gray40"),
    gridwidth = 0.5
  )
)

typhoon_Aug31 <- typhoon[typhoon$date > as.POSIXct("2018-08-31 00:00:00", tz="UTC") & typhoon$date < as.POSIXct("2018-09-01 00:00:00", tz="UTC"),]
typhoon_Jebi <- typhoon_all[typhoon_all$typhoon_name == "JEBI_Y",]
df_Aug31 <- df[df$frame == 1,]
library(plotly)  # for plotting
library(RColorBrewer)  # for colors
# plot with boats, ports, and typhoons
# Note: geo is defined in the appendix -- it controls layout
palette = brewer.pal(8, "Dark2")[c(1,8,3,2)]
p <- plot_geo(colors=palette) %>%
  add_markers(data=df_ports, x = ~port_lon, y = ~port_lat, color = "Port") %>%
  add_markers(data=df_Aug31, x = ~lon, y = ~lat, color = ~ship_type,
              text=~paste('Ship name',shipname)) %>%
  add_markers(data=typhoon_Aug31, x = ~lon, y = ~lat, color="TYPHOON",
              text=~paste("Name", typhoon_name)) %>%
   layout(showlegend = TRUE, geo = geo,
    title = 'Singaporean owned container and tanker ships, August 31, 2018')
p
# plot with boats and typhoons
palette = brewer.pal(8, "Dark2")[c(1,3,2)]
p <- plot_geo(colors=palette) %>%
  add_markers(data=df_all[df_all$frame == 14,], x = ~lon, y = ~lat, color = ~ship_type,
              text=~paste('Ship name',shipname)) %>%
  add_markers(data=typhoon_Jebi, x = ~lon, y = ~lat, color="Typhoon Jebi", text=~paste("Name", typhoon_name, "</br>Time: ", date)) %>%
   layout(
    showlegend = TRUE, geo = geo,
    title = 'Singaporean container/tanker ships, September 4, 2018, evening')
p
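If a standalone copy of an interactive figure is needed, it can be exported with htmlwidgets (the file name below is just an example):

# Optional export of the interactive map (hypothetical file name)
htmlwidgets::saveWidget(p, "jebi_ships_map.html", selfcontained = TRUE)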
library(leaflet)
library(leaflet.extras)

# typhoon icons
ty_icons <- pulseIcons(color='red',  # supports hex colors like "#C0C0C0"
  heartbeat = 1.2,  # heartbeat is the frequency of pulsing
  iconSize=3)

# ship icons
shipicons <- iconList(
  ship = makeIcon("../Figures/ship.png", NULL, 18, 18)
)

leaflet() %>%
  addTiles() %>% 
  setView(lng = 136, lat = 20, zoom=3) %>%
  addCircleMarkers(data=typhoon_Jebi[typhoon_Jebi$intensity_vmax <= 150/1.852,], lng=~lon, lat=~lat,  # Not super typhoon
                   stroke = TRUE, radius=2, color="orange", label=~date) %>%
  addPulseMarkers(data=typhoon_Jebi[typhoon_Jebi$intensity_vmax > 150/1.852,], lng=~lon, lat=~lat,  # Super typhoon
                   icon=ty_icons, label=~date) %>%
  addMarkers(data=df_all[df_all$frame == 14,], lng=~lon, lat=~lat,
             label=~shipname, icon=shipicons)
# Load data
df3 <- read.csv("../../Data/Shipping/combined.csv.gz")

# Determine the main DV: delayed
df3 <- df3 %>%
  arrange(imo, last_update) %>%
  group_by(imo) %>%
  mutate(delayed = ifelse(difftime(arrive_time, lead(arrive_time), units="days") > 1/8 &
                            arrive_time > last_update &
                            left_port == lead(left_port), 1, 0)) %>%
  ungroup()
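As a minimal sketch on toy data (not the shipping data) of what lead() does within each group here:

# lead() pulls the next row's value within each imo group; the last row in a group gets NA
tibble(imo = c(1, 1, 2, 2), arrive_time = c(10, 12, 20, 25)) %>%
  group_by(imo) %>%
  mutate(next_arrive = lead(arrive_time)) %>%
  ungroup()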
library(sf)  # for spatial points and distance calculations
Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
# sf can't handle NA values, so we need to filter them out
df3 <- df3 %>% mutate(n = row_number())
df_ty <- df3 %>% filter(!is.na(ty_lat), !is.na(ty_lon))

# Create lists of points on Earth
x <- st_as_sf(df_ty %>% select("lon", "lat"), coords = c("lon", "lat"), 
                 crs = 4326, agr = "constant")
y <- st_as_sf(df_ty %>% select("ty_lon", "ty_lat"), coords = c("ty_lon", "ty_lat"), 
                 crs = 4326, agr = "constant")

# Find the distance between the points
df_ty$dist_typhoon <- st_distance(x, y, by_element=T) / 1000  # distance in km

# Add the distance back to our main data
# Replace NAs with a placeholder larger than any possible distance (~half the Earth's circumference in km)
df3 <- df3 %>%
  left_join(df_ty %>% select(n, dist_typhoon)) %>%
  mutate(dist_typhoon = ifelse(is.na(dist_typhoon), 20036, dist_typhoon)) %>%
  select(-n)
Joining with `by = join_by(n)`
df3$typhoon_500 = ifelse(df3$dist_typhoon < 500 & 
                         df3$dist_typhoon >= 0, 1, 0)
df3$typhoon_1000 = ifelse(df3$dist_typhoon < 1000 &
                          df3$dist_typhoon >= 500, 1, 0)
df3$typhoon_2000 = ifelse(df3$dist_typhoon < 2000 &
                          df3$dist_typhoon >= 1000, 1, 0)
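As a hedged alternative, the same mutually exclusive distance bands could be built in one step with cut(), mirroring its use for typhoon intensity below:

# Sketch only -- a single factor instead of three dummy variables
df3$typhoon_band <- cut(df3$dist_typhoon,
                        breaks = c(0, 500, 1000, 2000, Inf),
                        labels = c("<500km", "500-1000km", "1000-2000km", ">2000km"),
                        right = FALSE)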
fit1 <- glm(delayed ~ typhoon_500 + typhoon_1000 + typhoon_2000, data=df3,
            family=binomial)
summary(fit1)

Call:
glm(formula = delayed ~ typhoon_500 + typhoon_1000 + typhoon_2000, 
    family = binomial, data = df3)

Coefficients:
             Estimate Std. Error  z value Pr(>|z|)    
(Intercept)  -3.65381    0.02931 -124.653   <2e-16 ***
typhoon_500   0.14376    0.16311    0.881   0.3781    
typhoon_1000  0.20682    0.12575    1.645   0.1000    
typhoon_2000  0.15681    0.07116    2.203   0.0276 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 14340  on 59255  degrees of freedom
Residual deviance: 14333  on 59252  degrees of freedom
  (3795 observations deleted due to missingness)
AIC: 14341

Number of Fisher Scoring iterations: 6
odds1 <- exp(coef(fit1))
odds1
 (Intercept)  typhoon_500 typhoon_1000 typhoon_2000 
  0.02589235   1.15460211   1.22975514   1.16976795 
m1 <- margins(fit1)
summary(m1)
       factor    AME     SE      z      p   lower  upper
 typhoon_1000 0.0053 0.0032 1.6435 0.1003 -0.0010 0.0115
 typhoon_2000 0.0040 0.0018 2.2006 0.0278  0.0004 0.0075
  typhoon_500 0.0037 0.0042 0.8811 0.3782 -0.0045 0.0118
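The extract_margin() helper from the Walmart analysis works here as well, for example:

# Reusing the earlier quosure-based helper (illustrative)
extract_margin(m1, typhoon_2000)  # roughly "0.4%" given the AME above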
# Indicators for typhoon strength (* 1.852 converts intensity_vmax from knots to km/h)
df3$Super <- ifelse(df3$intensity_vmax * 1.852 > 185, 1, 0)
df3$Moderate <- ifelse(df3$intensity_vmax * 1.852 >= 88 &
                         df3$intensity_vmax * 1.852 < 185, 1, 0)
df3$Weak <- ifelse(df3$intensity_vmax * 1.852 >= 41 &
                   df3$intensity_vmax * 1.852 < 88, 1, 0)
# Cut makes a categorical variable out of a numerical variable using specified bins
df3$HK_intensity <- cut(df3$intensity_vmax * 1.852, c(-1, 41, 62, 87, 117, 149, 999))
table(df3$HK_intensity)

  (-1,41]   (41,62]   (62,87]  (87,117] (117,149] (149,999] 
     3398     12039     12615     11527      2255     21141 
fit2 <- glm(delayed ~ (typhoon_500 + typhoon_1000 + typhoon_2000) : 
              (Weak + Moderate + Super), data=df3,
            family=binomial)
tidy(fit2)
# A tibble: 10 × 5
   term                  estimate std.error statistic p.value
   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)           -3.65       0.0290 -126.     0      
 2 typhoon_500:Weak      -0.00723    0.213    -0.0339 0.973  
 3 typhoon_500:Moderate   0.718      0.251     2.86   0.00417
 4 typhoon_500:Super     -8.91     123.       -0.0726 0.942  
 5 typhoon_1000:Weak      0.251      0.161     1.56   0.120  
 6 typhoon_1000:Moderate  0.120      0.273     0.441  0.659  
 7 typhoon_1000:Super    -0.0191     0.414    -0.0461 0.963  
 8 typhoon_2000:Weak      0.169      0.101     1.66   0.0963 
 9 typhoon_2000:Moderate  0.0260     0.134     0.194  0.846  
10 typhoon_2000:Super     0.316      0.136     2.33   0.0198 
m2 <- margins(fit2)
summary(m2) %>%
  html_df()
       factor         AME         SE          z          p       lower       upper
     Moderate   0.0007384  0.0006695   1.1028719  0.2700828  -0.0005738   0.0020505
        Super  -0.0049982  0.0859612  -0.0581445  0.9536336  -0.1734789   0.1634826
 typhoon_1000   0.0035770  0.0036204   0.9880155  0.3231451  -0.0035188   0.0106727
 typhoon_2000   0.0038115  0.0017865   2.1334568  0.0328873   0.0003100   0.0073130
  typhoon_500  -0.0440770  0.6812520  -0.0647000  0.9484129  -1.3793065   1.2911525
         Weak   0.0009451  0.0005135   1.8404065  0.0657086  -0.0000614   0.0019515
margins(fit2, at = list(Weak = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Weak         AME         SE          z          p       lower       upper
 typhoon_1000     1   0.0073448  0.0053623   1.3697146  0.1707760  -0.0031651   0.0178547
 typhoon_2000     1   0.0063950  0.0031242   2.0469359  0.0406644   0.0002717   0.0125182
  typhoon_500     1  -0.0457062  0.7044878  -0.0648786  0.9482707  -1.4264768   1.3350645
margins(fit2, at = list(Moderate = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Moderate         AME         SE          z          p       lower       upper
 typhoon_1000         1   0.0059123  0.0078242   0.7556384  0.4498660  -0.0094229   0.0212474
 typhoon_2000         1   0.0043822  0.0039442   1.1110625  0.2665414  -0.0033482   0.0121126
  typhoon_500         1  -0.0311833  0.6856382  -0.0454807  0.9637241  -1.3750095   1.3126429
margins(fit2, at = list(Super = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Super         AME         SE          z          p       lower       upper
 typhoon_1000      1   0.0032538  0.0111428   0.2920086  0.7702801  -0.0185858   0.0250934
 typhoon_2000      1   0.0102396  0.0041602   2.4613510  0.0138415   0.0020858   0.0183934
  typhoon_500      1  -0.2242811  3.1633421  -0.0709000  0.9434773  -6.4243176   5.9757554