library(knitr)
library(kableExtra)
#' Pretty formatted HTML tables
#'
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
if(!length(names)) {
names=colnames(df)
}
kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
{if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
{if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
{if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
{if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
library(tidyverse)
Code for Session 9
Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
df <- readRDS('../../Data/Session_6_models.rds')
df2 <- df %>% mutate(AAER = ifelse(AAER==0,0,1))
library(xgboost)
# Prep data
train_x <- model.matrix(AAER ~ ., data=df2[df2$Test==0,-1])[,-1]
train_y <- model.frame(AAER ~ ., data=df2[df2$Test==0,])[,"AAER"]
test_x <- model.matrix(AAER ~ ., data=df2[df2$Test==1,-1])[,-1]
test_y <- model.frame(AAER ~ ., data=df2[df2$Test==1,])[,"AAER"]
set.seed(468435) #for reproducibility
xgbCV <- xgb.cv(max_depth=5, eta=0.10, gamma=5, min_child_weight = 4,
subsample = 0.5, objective = "binary:logistic", data=train_x,
label=train_y, nrounds=100, eval_metric="auc", nfold=10,
stratified=TRUE, verbose=0)
fit_ens <- xgboost(params=xgbCV$params, data = train_x, label = train_y,
nrounds = which.max(xgbCV$evaluation_log$test_auc_mean),
verbose = 0)
library(yardstick)
all_x <- model.matrix(AAER ~ ., data=df[,-1])[,-1]
df$pred_ens <- predict(fit_ens, all_x, type="response")
df_test <- df %>% filter(Test==1)
aucs <- c(df_test %>% roc_auc(AAER, pred_ens, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(AAER, pred_BCE, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(AAER, pred_lmin, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(AAER, pred_xgb, event_level='second') %>% pull(.estimate))
names(aucs) <- c("Ensemble", "Logit (BCE)", "Lasso (lambda.min)", "XGBoost")
curve_out_ens <- df_test %>% roc_curve(AAER, pred_ens, event_level='second')
curve_out_BCE <- df_test %>% roc_curve(AAER, pred_BCE, event_level='second')
curve_out_lmin <- df_test %>% roc_curve(AAER, pred_lmin, event_level='second')
curve_out_xgb <- df_test %>% roc_curve(AAER, pred_xgb, event_level='second')
ggplot() +
geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) +
geom_line(data=curve_out_lmin, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.min")) +
geom_line(data=curve_out_xgb, aes(y=sensitivity, x=1-specificity, color="XGBoost")) +
geom_line(data=curve_out_ens, aes(y=sensitivity, x=1-specificity, color="Ensemble")) +
geom_abline(slope=1)
aucs # Out of sample
Ensemble Logit (BCE) Lasso (lambda.min) XGBoost
0.8169036 0.7599594 0.7290185 0.8083503
xgb.train.data = xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names = attr(xgb.train.data, ".Dimnames")[[2]]
imp = xgb.importance(col_names, fit_ens)
# Variable importance
xgb.plot.importance(imp)
library(gender)
df <- read_csv('../../Data/Session_9-City_of_Chicago_Salary_2019.04.csv')
names(df) <- make.names(names(df),unique = TRUE)
df <- df %>%
mutate(Full.Time = ifelse(Full.or.Part.Time == 'F',1 ,0),
Salaried = ifelse(Salary.or.Hourly == 'Salary', 1, 0),
Salary = ifelse(is.na(Annual.Salary), Typical.Hours * Hourly.Rate, Annual.Salary),
Obs = n()) %>%
group_by(Job.Titles) %>%
mutate(Job.Titles.Freq = n()/Obs) %>%
ungroup() %>%
group_by(Department) %>%
mutate(Department.Freq = n()/Obs) %>%
ungroup() %>%
mutate(Job.Titles = ifelse(Job.Titles.Freq > 0.03, Job.Titles, 'Other'),
Department = ifelse(Department.Freq > 0.05, Department, 'Other'),
Job.Titles = factor(Job.Titles),
Department = factor(Department),
first_name = str_extract(df$Name, '(?<=,[:space:]{2})[:alpha:]+'))
genders <- gender(df$first_name) %>% distinct(name, .keep_all = T)
genders <- genders %>%
mutate(Female = ifelse(proportion_female > proportion_male,1,0))
df <- left_join(df, genders[, c('name', 'Female')], by = c("first_name" = "name"))
train_x <- model.matrix(Salary ~ Job.Titles + Department + Full.Time + Salaried + Female, data=df)[,-1]
train_y <- model.frame(Salary ~ Job.Titles + Department + Full.Time + Salaried + Female, data=df)[, "Salary"]
set.seed(654687) #for reproducibility
xgbCV <- xgb.cv(max_depth=6, eta=0.30, gamma=0, min_child_weight = 1,
subsample = 1, booster = 'gbtree', data=train_x,
label=train_y, nrounds=100, eval_metric="rmse", nfold=10,
verbose=0)
fit_xgb <- xgboost(params=xgbCV$params, data = train_x, label = train_y,
nrounds = which.min(xgbCV$evaluation_log$test_rmse_mean))
[00:04:17] WARNING: src/learner.cc:767:
Parameters: { "silent" } are not used.
[1] train-rmse:56645.906707
[2] train-rmse:41397.048662
[3] train-rmse:31312.967000
[4] train-rmse:24913.566772
[5] train-rmse:21071.604695
[6] train-rmse:18900.317450
[7] train-rmse:17736.179105
[8] train-rmse:17133.908412
[9] train-rmse:16830.176110
[10] train-rmse:16678.446700
[11] train-rmse:16597.902839
[12] train-rmse:16559.520230
[13] train-rmse:16528.930025
[14] train-rmse:16515.575900
[15] train-rmse:16508.920316
[16] train-rmse:16506.080438
[17] train-rmse:16501.078483
[18] train-rmse:16497.987885
[19] train-rmse:16496.903970
[20] train-rmse:16496.268742
[21] train-rmse:16493.647143
[22] train-rmse:16492.447976
[23] train-rmse:16491.671073
[24] train-rmse:16490.640930
[25] train-rmse:16490.049645
[26] train-rmse:16489.608232
[27] train-rmse:16489.253997
[28] train-rmse:16489.050795
[29] train-rmse:16488.901063
[30] train-rmse:16488.769350
[31] train-rmse:16488.690556
[32] train-rmse:16488.645886
[33] train-rmse:16488.602285
[34] train-rmse:16488.563321
[35] train-rmse:16488.546512
[36] train-rmse:16488.529413
[37] train-rmse:16488.517873
[38] train-rmse:16488.509465
[39] train-rmse:16488.504471
[40] train-rmse:16488.499301
[41] train-rmse:16488.497087
[42] train-rmse:16488.493261
library('SHAPforxgboost')
#https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
vals <- shap.values(xgb_model = fit_xgb, X_train = train_x)
vals$mean_shap_score
Salaried
30789.37053
Job.TitlesPOLICE OFFICER
3391.77843
DepartmentOther
2263.00574
Female
2223.51487
DepartmentOEMC
672.56147
Job.TitlesOther
668.91992
Job.TitlesSERGEANT
581.33778
Full.Time
440.57020
DepartmentPOLICE
433.90578
DepartmentSTREETS & SAN
327.90918
Job.TitlesPOLICE OFFICER (ASSIGNED AS DETECTIVE)
136.00854
DepartmentWATER MGMNT
83.54228
Job.TitlesMOTOR TRUCK DRIVER
22.31799
Job.TitlesMOTOR TRUCK DRIVER
-46.47701
Job.TitlesOther
693.06793
Job.TitlesPOLICE OFFICER
-669.98481
Job.TitlesPOLICE OFFICER (ASSIGNED AS DETECTIVE)
-35.29269
Job.TitlesSERGEANT
-204.21080
DepartmentOEMC
-442.22006
DepartmentOther
-1071.70237
DepartmentPOLICE
280.49432
DepartmentSTREETS & SAN
-69.52624
DepartmentWATER MGMNT
81.47802
Full.Time
-1036.66089
Salaried
1548.70929
Female
-3724.62307
Job.TitlesMOTOR TRUCK DRIVER
-2.730796
Job.TitlesOther
232.434377
Job.TitlesPOLICE OFFICER
-158.892689
Job.TitlesPOLICE OFFICER (ASSIGNED AS DETECTIVE)
-9.971978
Job.TitlesSERGEANT
-88.875904
DepartmentOEMC
163.309598
DepartmentOther
289.308380
DepartmentPOLICE
105.864365
DepartmentSTREETS & SAN
-77.322399
DepartmentWATER MGMNT
12.078450
Full.Time
-11.665192
Salaried
-34.720889
Female
1583.569435
shap_long <- shap.prep(xgb_model = fit_xgb, X_train = train_x)
shap.plot.summary(shap_long)
shap.plot.dependence(data_long = shap_long, x = 'Female', y = 'Female',
color_feature = 'Job.TitlesPOLICE OFFICER', jitter_width = 0.05)
`geom_smooth()` using formula = 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at -0.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.005
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 2.6856e-26
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1.01
plot_data <- shap.prep.stack.data(shap_contrib = vals$shap_score,
top_n = 5, n_groups = 6)
shap.plot.force_plot_bygroup(plot_data)
g5
g6
g3
g4
g2
g1