Code for Session 4

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer; update these paths before running the code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first column. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight, indexed from the end: 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the header row should stay fixed when scrolling. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
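For reference, a minimal call might look like the following (a built-in dataset is used purely for illustration):

# Illustrative usage only -- iris stands in for any data frame
html_df(head(iris[, 1:4]),
        names = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width"),
        highlight_rows = c(1))  # bold and shade the first data row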

library(tidyverse)
library(broom)
# Import Walmart data from Session_Project
library(lubridate)
preprocess_data <- function(df) {
  # Merge in the store and feature data (weekly.stores and weekly.features are
  # pulled from outside of the function via R's lexical scoping)
  df <- df %>%
    inner_join(weekly.stores) %>%
    inner_join(weekly.features[,1:11])
  
  # Compress the five weird MarkDown columns into 1 variable
  df <- df %>%
    mutate(markdown = 0,
           markdown = ifelse(!is.na(MarkDown1),MarkDown1,markdown),
           markdown = ifelse(!is.na(MarkDown2),MarkDown2,markdown),
           markdown = ifelse(!is.na(MarkDown3),MarkDown3,markdown),
           markdown = ifelse(!is.na(MarkDown4),MarkDown4,markdown),
           markdown = ifelse(!is.na(MarkDown5),MarkDown5,markdown))
  
  # Fix dates and add useful time variables
  df <- df %>%
    mutate(date = as.Date(Date),
           week = week(date),
           year = year(date),
           month = month(date))

  df
}

weekly <- read_csv("../../Data/WMT_train.csv")
weekly.features <- read_csv("../../Data/WMT_features.csv")
weekly.stores <- read_csv("../../Data/WMT_stores.csv")
df <- preprocess_data(weekly)

df <- df %>%
  group_by(Store, Dept) %>% 
  mutate(store_avg=mean(Weekly_Sales, na.rm=T)) %>%
  ungroup()
# Create the binary variable from Walmart sales data
df <- df %>% mutate(double = ifelse(Weekly_Sales > store_avg*2,1,0))
fit <- glm(double ~ IsHoliday, data=df, family=binomial)
tidy(fit)
# A tibble: 2 × 5
  term          estimate std.error statistic  p.value
  <chr>            <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     -3.45    0.00924    -373.  0       
2 IsHolidayTRUE    0.539   0.0278       19.4 1.09e-83
# Odds ratios for each coefficient (instead of exponentiating by hand):
exp(coef(fit))
  (Intercept) IsHolidayTRUE 
   0.03184725    1.71367497 
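The exponentiated coefficients can also be read as percentage changes in the odds; a quick sketch of that conversion:

# (exp(coef) - 1) * 100 gives the percent change in odds for a one-unit change;
# e.g., ~71% higher odds that weekly sales exceed double the department average in holiday weeks
(exp(coef(fit)) - 1) * 100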
test_data <- data.frame(IsHoliday = c(F,T))
predict(fit, test_data)  # log odds
        1         2 
-3.446804 -2.908164 
# probabilities
predict(fit, test_data, type="response")
         1          2 
0.03086431 0.05175146 
a <- exp(-3.446804)
b <- exp(-2.908164)
c(a/(1+a), b/(1+b))
[1] 0.03086431 0.05175146
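The manual conversion above is just the logistic function applied to the log odds; a minimal sketch of a reusable helper (base R's plogis() is equivalent):

# Sketch: convert log odds to probabilities via the logistic function
logit_to_prob <- function(log_odds) {
  exp(log_odds) / (1 + exp(log_odds))
}
logit_to_prob(c(-3.446804, -2.908164))  # same probabilities as above
plogis(c(-3.446804, -2.908164))         # built-in equivalent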
fit2 <- glm(double ~ IsHoliday + Temperature + Fuel_Price, data=df, family=binomial)
summary(fit2)

Call:
glm(formula = double ~ IsHoliday + Temperature + Fuel_Price, 
    family = binomial, data = df)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -1.7764917  0.0673246  -26.39   <2e-16 ***
IsHolidayTRUE  0.3704298  0.0284395   13.03   <2e-16 ***
Temperature   -0.0108268  0.0004698  -23.04   <2e-16 ***
Fuel_Price    -0.3091950  0.0196234  -15.76   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 120370  on 421569  degrees of freedom
Residual deviance: 119146  on 421566  degrees of freedom
AIC: 119154

Number of Fisher Scoring iterations: 6
# Odds
exp(coef(fit2))
  (Intercept) IsHolidayTRUE   Temperature    Fuel_Price 
    0.1692308     1.4483570     0.9892316     0.7340376 
# Typical September days
hday_sep    <- mean(predict(fit2, filter(df, IsHoliday, month==9), type="response"))
no_hday_sep <-  mean(predict(fit2, filter(df, !IsHoliday, month==9), type="response"))
# Typical December days
hday_dec    <- mean(predict(fit2, filter(df, IsHoliday, month==12), type="response"))
no_hday_dec <-  mean(predict(fit2, filter(df, !IsHoliday, month==12), type="response"))

html_df(data.frame(Month=c(9,9,12,12), IsHoliday=c(FALSE,TRUE,FALSE,TRUE),
                   Probability=c(no_hday_sep, hday_sep, no_hday_dec, hday_dec)))
 Month  IsHoliday  Probability
     9      FALSE    0.0266789
     9       TRUE    0.0374761
    12      FALSE    0.0398377
    12       TRUE    0.0586483
percent <- function(x) {
  paste0(round(x*100,2),"%")
}
library(rlang)
# Another use for a quosure :)
# You could always just use quoted arguments here though
extract_margin <- function(m, x) {
  x <- enquo(x)
  percent(filter(summary(m), factor==quo_text(x))$AME)
}
# Calculate AME marginal effects
library(margins)
m <- margins(fit2)
m
 Temperature Fuel_Price IsHoliday
  -0.0003377  -0.009644   0.01334
summary(m) %>%
  html_df()
      factor         AME         SE          z  p       lower       upper
  Fuel_Price  -0.0096438  0.0006163  -15.64800  0  -0.0108517  -0.0084359
   IsHoliday   0.0133450  0.0011754   11.35372  0   0.0110413   0.0156487
 Temperature  -0.0003377  0.0000149  -22.71256  0  -0.0003668  -0.0003085
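With the margins object m in hand, the extract_margin() helper defined above can pull a single effect out as a formatted percentage, for example:

# Illustrative use of the quosure-based helper defined earlier
extract_margin(m, IsHoliday)    # roughly "1.33%" given the AME shown above
extract_margin(m, Temperature)  # roughly "-0.03%"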
plot(m, which=summary(m)$factor)

margins(fit2, at = list(IsHoliday = c(TRUE, FALSE)),
        variables = c("Temperature", "Fuel_Price")) %>%
  summary() %>%
  html_df()
      factor  IsHoliday         AME         SE          z  p       lower       upper
  Fuel_Price      FALSE  -0.0093401  0.0005989  -15.59617  0  -0.0105139  -0.0081664
  Fuel_Price       TRUE  -0.0131335  0.0008717  -15.06650  0  -0.0148420  -0.0114250
 Temperature      FALSE  -0.0003271  0.0000146  -22.46024  0  -0.0003556  -0.0002985
 Temperature       TRUE  -0.0004599  0.0000210  -21.92927  0  -0.0005010  -0.0004188
margins(fit2, at = list(Temperature = c(0, 25, 50, 75, 100)),
        variables = c("IsHoliday")) %>%
  summary() %>%
  html_df()
    factor  Temperature        AME         SE         z  p      lower      upper
 IsHoliday            0  0.0234484  0.0020168  11.62643  0  0.0194955  0.0274012
 IsHoliday           25  0.0184956  0.0015949  11.59704  0  0.0153697  0.0216214
 IsHoliday           50  0.0144798  0.0012679  11.42060  0  0.0119948  0.0169648
 IsHoliday           75  0.0112693  0.0010161  11.09035  0  0.0092777  0.0132609
 IsHoliday          100  0.0087305  0.0008213  10.62977  0  0.0071207  0.0103402
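The same at() output can also be plotted directly; a sketch (using ggplot2, loaded with the tidyverse) of how the holiday effect shrinks as temperature rises:

# Sketch: plot the AME of IsHoliday across the specified temperatures
margins(fit2, at = list(Temperature = c(0, 25, 50, 75, 100)),
        variables = c("IsHoliday")) %>%
  summary() %>%
  ggplot(aes(x = Temperature, y = AME)) +
  geom_line() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 2) +
  labs(y = "AME of IsHoliday")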
# load data
df <- read_csv("../../Data/Shipping/shipping_dataset.csv", na="NA")
Rows: 71460 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): left_type, arrive_type, status
dbl (11): imo, run, lat, lon, last_update, left_time, left_port, arrive_time...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_ships <- read_csv("../../Data/Shipping/index.csv", na="NA")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 2512 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): flag, shipname, url, photo, photos link, recognized_next_port, rep...
dbl  (4): imo, lat_of_latest_position, lon_of_latest_position, year_of_build

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_ports <- read_csv("../../Data/Shipping/port_dataset.csv", na="NA")
Rows: 1117 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (3): port, lat, lon

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
typhoon <- read_csv("../../Data/Shipping/typhoon.csv")
Rows: 33419 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): date, time, constrnt, wkng, rpd, scene, est_rmw, mthd, sat, commen...
dbl (13): intensity_ci, intensity_mslp, intensity_vmax, tno_fnl, tno_adj, tn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# rename lat and lon to avoid conflicts
df_ports <- df_ports %>% rename(port_lat=lat, port_lon=lon)

# Join in ships and ports
df <- df %>% 
  left_join(df_ships) %>%
  filter(imo != 9528029 | reported_destination == "AU ADL") %>%
  left_join(df_ports, by=c("left_port"="port")) %>%
  rename(left_lat=port_lat, left_lon=port_lon) %>%
  left_join(df_ports, by=c("arrive_port"="port")) %>%
  rename(arrive_lat=port_lat, arrive_lon=port_lon)
Joining with `by = join_by(imo)`
Warning in left_join(., df_ships): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 847 of `x` matches multiple rows in `y`.
ℹ Row 84 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
# fix dates
df <- df %>% 
  mutate(last_update = as.POSIXct(last_update, tz="UTC",origin="1970-01-01"),
         left_time = as.POSIXct(left_time, tz="UTC",origin="1970-01-01"),
         arrive_time = as.POSIXct(arrive_time, tz="UTC",origin="1970-01-01"))
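These timestamp columns come in as numeric seconds since 1970-01-01 UTC (the Unix epoch), which is why origin is supplied; for example:

# Unix seconds map back to a date-time once an origin is given
as.POSIXct(1535673600, tz="UTC", origin="1970-01-01")  # "2018-08-31 UTC"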

# Fix typhoon dates
typhoon$date <- as.POSIXct(paste(typhoon$date,typhoon$time), format="%Y%b%d %H%M%S", tz="UTC", origin="1970-01-01")
# Fix typhoon longitude -- the data stores longitude multiplied by -1, so flip the sign back
typhoon$lon <- typhoon$lon * -1

typhoon_all <- typhoon
# filter to dates in sample
typhoon <- typhoon %>%
  filter(date > as.POSIXct("2018-08-30", tz="UTC"))

df <- df %>% rename(frame=run)

df_all <- df[df$frame != 32,]
df <- df %>% filter(last_update > as.POSIXct("2018-08-30", tz="UTC"))

library(plotly)  # needed for the toRGB() function

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
geo <- list(
  showland = TRUE,
  showlakes = TRUE,
  showcountries = TRUE,
  showocean = TRUE,
  countrywidth = 0.5,
  landcolor = toRGB("grey90"),
  lakecolor = toRGB("aliceblue"),
  oceancolor = toRGB("aliceblue"),
  projection = list(
    type = 'orthographic',  # detailed at https://plot.ly/r/reference/#layout-geo-projection
    rotation = list(
      lon = 100,
      lat = 1,
      roll = 0
    )
  ),
  lonaxis = list(
    showgrid = TRUE,
    gridcolor = toRGB("gray40"),
    gridwidth = 0.5
  ),
  lataxis = list(
    showgrid = TRUE,
    gridcolor = toRGB("gray40"),
    gridwidth = 0.5
  )
)

typhoon_Aug31 <- typhoon[typhoon$date > as.POSIXct("2018-08-31 00:00:00", tz="UTC") & typhoon$date < as.POSIXct("2018-09-01 00:00:00", tz="UTC"),]
typhoon_Jebi <- typhoon_all[typhoon_all$typhoon_name == "JEBI_Y",]
df_Aug31 <- df[df$frame == 1,]
library(plotly)  # for plotting
library(RColorBrewer)  # for colors
# plot with boats, ports, and typhoons
# Note: geo is defined in the appendix -- it controls layout
palette = brewer.pal(8, "Dark2")[c(1,8,3,2)]
p <- plot_geo(colors=palette) %>%
  add_markers(data=df_ports, x = ~port_lon, y = ~port_lat, color = "Port") %>%
  add_markers(data=df_Aug31, x = ~lon, y = ~lat, color = ~ship_type,
              text=~paste('Ship name',shipname)) %>%
  add_markers(data=typhoon_Aug31, x = ~lon, y = ~lat, color="TYPHOON",
              text=~paste("Name", typhoon_name)) %>%
   layout(showlegend = TRUE, geo = geo,
    title = 'Singaporean owned container and tanker ships, August 31, 2018')
p
# plot with boats and typhoons
palette = brewer.pal(8, "Dark2")[c(1,3,2)]
p <- plot_geo(colors=palette) %>%
  add_markers(data=df_all[df_all$frame == 14,], x = ~lon, y = ~lat, color = ~ship_type,
              text=~paste('Ship name',shipname)) %>%
  add_markers(data=typhoon_Jebi, x = ~lon, y = ~lat, color="Typhoon Jebi", text=~paste("Name", typhoon_name, "</br>Time: ", date)) %>%
   layout(
    showlegend = TRUE, geo = geo,
    title = 'Singaporean container/tanker ships, September 4, 2018, evening')
p
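If a standalone copy of an interactive figure is needed, it can be exported with htmlwidgets (the file name below is just an example):

# Optional export of the interactive map (hypothetical file name)
htmlwidgets::saveWidget(p, "jebi_ships_map.html", selfcontained = TRUE)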
library(leaflet)
library(leaflet.extras)

# typhoon icons
ty_icons <- pulseIcons(color='red',  # supports hex colors like "#C0C0C0"
  heartbeat = 1.2,  # heartbeat is the frequency of pulsing
  iconSize=3)

# ship icons
shipicons <- iconList(
  ship = makeIcon("../Figures/ship.png", NULL, 18, 18)
)

leaflet() %>%
  addTiles() %>% 
  setView(lng = 136, lat = 20, zoom=3) %>%
  addCircleMarkers(data=typhoon_Jebi[typhoon_Jebi$intensity_vmax <= 150/1.852,], lng=~lon, lat=~lat,  # Not super typhoon
                   stroke = TRUE, radius=2, color="orange", label=~date) %>%
  addPulseMarkers(data=typhoon_Jebi[typhoon_Jebi$intensity_vmax > 150/1.852,], lng=~lon, lat=~lat,  # Super typhoon
                   icon=ty_icons, label=~date) %>%
  addMarkers(data=df_all[df_all$frame == 14,], lng=~lon, lat=~lat,
             label=~shipname, icon=shipicons)
# Load data
df3 <- read.csv("../../Data/Shipping/combined.csv.gz")

# Determine the main DV: delayed
df3 <- df3 %>%
  arrange(imo, last_update) %>%
  group_by(imo) %>%
  mutate(delayed = ifelse(difftime(arrive_time, lead(arrive_time), units="days") > 1/8 &
                            arrive_time > last_update &
                            left_port == lead(left_port), 1, 0)) %>%
  ungroup()
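As a minimal sketch on toy data (not the shipping data) of what lead() does within each group here:

# lead() pulls the next row's value within each imo group; the last row in a group gets NA
tibble(imo = c(1, 1, 2, 2), arrive_time = c(10, 12, 20, 25)) %>%
  group_by(imo) %>%
  mutate(next_arrive = lead(arrive_time)) %>%
  ungroup()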
library(sf)  # for spatial points and distance calculations
Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
# sf can't handle NA values, so we need to filter them out
df3 <- df3 %>% mutate(n = row_number())
df_ty <- df3 %>% filter(!is.na(ty_lat), !is.na(ty_lon))

# Create lists of points on Earth
x <- st_as_sf(df_ty %>% select("lon", "lat"), coords = c("lon", "lat"), 
                 crs = 4326, agr = "constant")
y <- st_as_sf(df_ty %>% select("ty_lon", "ty_lat"), coords = c("ty_lon", "ty_lat"), 
                 crs = 4326, agr = "constant")

# Find the distance between the points
df_ty$dist_typhoon <- st_distance(x, y, by_element=T) / 1000  # distance in km

# Add the distance back to our main data
# Replace NAs with a placeholder larger than any possible distance (~half the Earth's circumference in km)
df3 <- df3 %>%
  left_join(df_ty %>% select(n, dist_typhoon)) %>%
  mutate(dist_typhoon = ifelse(is.na(dist_typhoon), 20036, dist_typhoon)) %>%
  select(-n)
Joining with `by = join_by(n)`
df3$typhoon_500 = ifelse(df3$dist_typhoon < 500 & 
                         df3$dist_typhoon >= 0, 1, 0)
df3$typhoon_1000 = ifelse(df3$dist_typhoon < 1000 &
                          df3$dist_typhoon >= 500, 1, 0)
df3$typhoon_2000 = ifelse(df3$dist_typhoon < 2000 &
                          df3$dist_typhoon >= 1000, 1, 0)
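As a hedged alternative, the same mutually exclusive distance bands could be built in one step with cut(), mirroring its use for typhoon intensity below:

# Sketch only -- a single factor instead of three dummy variables
df3$typhoon_band <- cut(df3$dist_typhoon,
                        breaks = c(0, 500, 1000, 2000, Inf),
                        labels = c("<500km", "500-1000km", "1000-2000km", ">2000km"),
                        right = FALSE)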
fit1 <- glm(delayed ~ typhoon_500 + typhoon_1000 + typhoon_2000, data=df3,
            family=binomial)
summary(fit1)

Call:
glm(formula = delayed ~ typhoon_500 + typhoon_1000 + typhoon_2000, 
    family = binomial, data = df3)

Coefficients:
             Estimate Std. Error  z value Pr(>|z|)    
(Intercept)  -3.65381    0.02931 -124.653   <2e-16 ***
typhoon_500   0.14376    0.16311    0.881   0.3781    
typhoon_1000  0.20682    0.12575    1.645   0.1000    
typhoon_2000  0.15681    0.07116    2.203   0.0276 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 14340  on 59255  degrees of freedom
Residual deviance: 14333  on 59252  degrees of freedom
  (3795 observations deleted due to missingness)
AIC: 14341

Number of Fisher Scoring iterations: 6
odds1 <- exp(coef(fit1))
odds1
 (Intercept)  typhoon_500 typhoon_1000 typhoon_2000 
  0.02589235   1.15460211   1.22975514   1.16976795 
m1 <- margins(fit1)
summary(m1)
       factor    AME     SE      z      p   lower  upper
 typhoon_1000 0.0053 0.0032 1.6435 0.1003 -0.0010 0.0115
 typhoon_2000 0.0040 0.0018 2.2006 0.0278  0.0004 0.0075
  typhoon_500 0.0037 0.0042 0.8811 0.3782 -0.0045 0.0118
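The extract_margin() helper from the Walmart analysis works here as well, for example:

# Reusing the earlier quosure-based helper (illustrative)
extract_margin(m1, typhoon_2000)  # roughly "0.4%" given the AME above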
# Indicators for typhoon strength (* 1.852 converts intensity_vmax from knots to km/h)
df3$Super <- ifelse(df3$intensity_vmax * 1.852 > 185, 1, 0)
df3$Moderate <- ifelse(df3$intensity_vmax * 1.852 >= 88 &
                         df3$intensity_vmax * 1.852 < 185, 1, 0)
df3$Weak <- ifelse(df3$intensity_vmax * 1.852 >= 41 &
                   df3$intensity_vmax * 1.852 < 88, 1, 0)
# Cut makes a categorical variable out of a numerical variable using specified bins
df3$HK_intensity <- cut(df3$intensity_vmax * 1.852, c(-1, 41, 62, 87, 117, 149, 999))
table(df3$HK_intensity)

  (-1,41]   (41,62]   (62,87]  (87,117] (117,149] (149,999] 
     3398     12039     12615     11527      2255     21141 
fit2 <- glm(delayed ~ (typhoon_500 + typhoon_1000 + typhoon_2000) : 
              (Weak + Moderate + Super), data=df3,
            family=binomial)
tidy(fit2)
# A tibble: 10 × 5
   term                  estimate std.error statistic p.value
   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)           -3.65       0.0290 -126.     0      
 2 typhoon_500:Weak      -0.00723    0.213    -0.0339 0.973  
 3 typhoon_500:Moderate   0.718      0.251     2.86   0.00417
 4 typhoon_500:Super     -8.91     123.       -0.0726 0.942  
 5 typhoon_1000:Weak      0.251      0.161     1.56   0.120  
 6 typhoon_1000:Moderate  0.120      0.273     0.441  0.659  
 7 typhoon_1000:Super    -0.0191     0.414    -0.0461 0.963  
 8 typhoon_2000:Weak      0.169      0.101     1.66   0.0963 
 9 typhoon_2000:Moderate  0.0260     0.134     0.194  0.846  
10 typhoon_2000:Super     0.316      0.136     2.33   0.0198 
m2 <- margins(fit2)
summary(m2) %>%
  html_df()
       factor         AME         SE          z          p       lower       upper
     Moderate   0.0007384  0.0006695   1.1028719  0.2700828  -0.0005738   0.0020505
        Super  -0.0049982  0.0859612  -0.0581445  0.9536336  -0.1734789   0.1634826
 typhoon_1000   0.0035770  0.0036204   0.9880155  0.3231451  -0.0035188   0.0106727
 typhoon_2000   0.0038115  0.0017865   2.1334568  0.0328873   0.0003100   0.0073130
  typhoon_500  -0.0440770  0.6812520  -0.0647000  0.9484129  -1.3793065   1.2911525
         Weak   0.0009451  0.0005135   1.8404065  0.0657086  -0.0000614   0.0019515
margins(fit2, at = list(Weak = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Weak         AME         SE          z          p       lower       upper
 typhoon_1000     1   0.0073448  0.0053623   1.3697146  0.1707760  -0.0031651   0.0178547
 typhoon_2000     1   0.0063950  0.0031242   2.0469359  0.0406644   0.0002717   0.0125182
  typhoon_500     1  -0.0457062  0.7044878  -0.0648786  0.9482707  -1.4264768   1.3350645
margins(fit2, at = list(Moderate = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Moderate         AME         SE          z          p       lower       upper
 typhoon_1000         1   0.0059123  0.0078242   0.7556384  0.4498660  -0.0094229   0.0212474
 typhoon_2000         1   0.0043822  0.0039442   1.1110625  0.2665414  -0.0033482   0.0121126
  typhoon_500         1  -0.0311833  0.6856382  -0.0454807  0.9637241  -1.3750095   1.3126429
margins(fit2, at = list(Super = c(1)),
        variables = c("typhoon_500", "typhoon_1000", "typhoon_2000")) %>%
  summary() %>%
  html_df()
       factor  Super         AME         SE          z          p       lower       upper
 typhoon_1000      1   0.0032538  0.0111428   0.2920086  0.7702801  -0.0185858   0.0250934
 typhoon_2000      1   0.0102396  0.0041602   2.4613510  0.0138415   0.0020858   0.0183934
  typhoon_500      1  -0.2242811  3.1633421  -0.0709000  0.9434773  -6.4243176   5.9757554