Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
library(tidyverse)
```r
```r
df <- readRDS('../../Data/Session_6_models.rds')
library(xgboost)
# Prep data
train_x <- model.matrix(AAER ~ ., data=df[df$Test==0,-1])[,-1]
train_y <- model.frame(AAER ~ ., data=df[df$Test==0,])[,\AAER\]
test_x <- model.matrix(AAER ~ ., data=df[df$Test==1,-1])[,-1]
test_y <- model.frame(AAER ~ ., data=df[df$Test==1,])[,\AAER\]
set.seed(468435) #for reproducibility
xgbCV <- xgb.cv(max_depth=5, eta=0.10, gamma=5, min_child_weight = 4,
subsample = 0.57, objective = \binary:logistic\, data=train_x,
label=train_y, nrounds=100, eval_metric=\auc\, nfold=10,
stratified=TRUE, verbose=0)
fit_ens <- xgboost(params=xgbCV$params, data = train_x, label = train_y,
nrounds = which.max(xgbCV$evaluation_log$test_auc_mean),
verbose = 0)
library(yardstick)
all_x <- model.matrix(AAER ~ ., data=df[,-1])[,-1]
df$pred_ens <- predict(fit_ens, all_x, type="response")
df_test <- df %>% filter(Test==1)
aucs <- c(df_test %>% roc_auc(truth=AAER, estimate=pred_ens, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(truth=AAER, estimate=pred_BCE, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(truth=AAER, estimate=pred_lmin, event_level='second') %>% pull(.estimate),
df_test %>% roc_auc(truth=AAER, estimate=pred_xgb, event_level='second') %>% pull(.estimate))
names(aucs) <- c("Ensemble", "Logit (BCE)", "Lasso (lambda.min)", "XGBoost")
curve_out_ens <- df_test %>% roc_curve(truth=AAER, estimate=pred_ens, event_level='second')
curve_out_BCE <- df_test %>% roc_curve(truth=AAER, estimate=pred_BCE, event_level='second')
curve_out_lmin <- df_test %>% roc_curve(truth=AAER, estimate=pred_lmin, event_level='second')
curve_out_xgb <- df_test %>% roc_curve(truth=AAER, estimate=pred_xgb, event_level='second')
ggplot() +
geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) +
geom_line(data=curve_out_lmin, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.min")) +
geom_line(data=curve_out_xgb, aes(y=sensitivity, x=1-specificity, color="XGBoost")) +
geom_line(data=curve_out_ens, aes(y=sensitivity, x=1-specificity, color="Ensemble")) +
geom_abline(slope=1)
aucs # Out of sample
Ensemble Logit (BCE) Lasso (lambda.min) XGBoost
0.8169036 0.7599594 0.7290185 0.8083503
xgb.train.data = xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names = attr(xgb.train.data, ".Dimnames")[[2]]
imp = xgb.importance(col_names, fit_ens)
# Variable importance
xgb.plot.importance(imp)
library(gender)
PLEASE NOTE: The method provided by this package must be used cautiously
and responsibly. Please be sure to see the guidelines and warnings about
usage in the README or the package documentation.
df <- read_csv('../../Data/City_of_Chicago_Salary_2019.04.csv')
-- Column specification ---------------------------------------------------------------------------------------------------------------------------------------
cols(
Name = col_character(),
`Job Titles` = col_character(),
Department = col_character(),
`Full or Part-Time` = col_character(),
`Salary or Hourly` = col_character(),
`Typical Hours` = col_double(),
`Annual Salary` = col_double(),
`Hourly Rate` = col_double()
)
names(df) <- make.names(names(df),unique = TRUE)
df <- df %>%
mutate(Full.Time = ifelse(Full.or.Part.Time == 'F',1 ,0),
Salaried = ifelse(Salary.or.Hourly == 'Salary', 1, 0),
Salary = ifelse(is.na(Annual.Salary), Typical.Hours * Hourly.Rate, Annual.Salary),
Obs = n()) %>%
group_by(Job.Titles) %>%
mutate(Job.Titles.Freq = n()/Obs) %>%
ungroup() %>%
group_by(Department) %>%
mutate(Department.Freq = n()/Obs) %>%
ungroup() %>%
mutate(Job.Titles = ifelse(Job.Titles.Freq > 0.03, Job.Titles, 'Other'),
Department = ifelse(Department.Freq > 0.05, Department, 'Other'),
Job.Titles = factor(Job.Titles),
Department = factor(Department),
first_name = str_extract(df$Name, '(?<=,[:space:]{2})[:alpha:]+'))
genders <- gender(df$first_name) %>% distinct(name, .keep_all = T)
genders <- genders %>%
mutate(Female = ifelse(proportion_female > proportion_male,1,0))
df <- left_join(df, genders[, c('name', 'Female')], by = c("first_name" = "name"))
train_x <- model.matrix(Salary ~ Job.Titles + Department + Full.Time + Salaried + Female, data=df)[,-1]
train_y <- model.frame(Salary ~ Job.Titles + Department + Full.Time + Salaried + Female, data=df)[, "Salary"]
set.seed(654687) #for reproducibility
xgbCV <- xgb.cv(max_depth=6, eta=0.30, gamma=0, min_child_weight = 1,
subsample = 1, booster = 'gblinear', data=train_x,
label=train_y, nrounds=100, eval_metric="rmse", nfold=10,
verbose=0)
fit_xgb <- xgboost(params=xgbCV$params, data = train_x, label = train_y,
nrounds = which.max(xgbCV$evaluation_log$test_rmse_mean))
[00:44:27] WARNING: amalgamation/../src/learner.cc:573:
Parameters: { "gamma", "max_depth", "min_child_weight", "silent", "subsample" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[1] train-rmse:33171.421875
library('SHAPforxgboost')
#https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/
shap_values <- shap.values(xgb_model = fit_xgb, X_train = train_x)
shap_values$mean_shap_score
Salaried Full.Time DepartmentPOLICE
15541.90119 14542.46170 8310.01701
Job.TitlesOther Job.TitlesPOLICE OFFICER Female
6298.13637 5415.61501 3823.13352
DepartmentOther Job.TitlesSERGEANT Job.TitlesPOLICE OFFICER (ASSIGNED AS DETECTIVE)
2477.28417 1021.78531 773.86084
Job.TitlesMOTOR TRUCK DRIVER DepartmentOEMC DepartmentSTREETS & SAN
259.84506 175.65395 89.93510
DepartmentWATER MGMNT
29.38586
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train=train_x)
shap.plot.summary(shap_long)
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score,
top_n = 5, n_groups = 6)
The SHAP values of the Rest 8 features were summed into variable 'rest_variables'.
shap.plot.force_plot_bygroup(plot_data)