Code
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(parsnip)
library(doFuture)
library(doParallel)
library(xgboost)
library(vip)
library(haven)
library(here)
library(icpsrdata)
library(plotly)
dir.create("data")library(tidyverse)
library(tidymodels)
library(ggplot2)
library(parsnip)
library(doFuture)
library(doParallel)
library(xgboost)
library(vip)
library(haven)
library(here)
library(icpsrdata)
library(plotly)
dir.create("data")# crime predictive data takes 1-2 mins
icpsr_download(
file_id = 38649,
download_dir = here("data")
)
# crime implementation crime data takes 1-2 mins
icpsr_download(
file_id = 37059,
download_dir = here("data")
)
# Ses data takes 9-10 mins
icpsr_download(
file_id = 38528,
download_dir = here("data")
)
# school counts takes 1-2 mins
icpsr_download(
file_id = 38569,
download_dir = here("data")
)
# street connectivity takes 1-2 mins
icpsr_download(
file_id = 38580,
download_dir = here("data")
)
# pollution takes 1 min
icpsr_download(
file_id = 38597,
download_dir = here("data")
)
### land cover takes 1-2 mins
icpsr_download(
file_id = 38598,
download_dir = here("data")
)# crime modeling data
crime_predictive <-
read_dta(
here("data",
"ICPSR_38649",
"DS0001",
"38649-0001-Data.dta"))
# crime implementation data
crime_implementation <-
read_dta(
here("data",
"ICPSR_37059",
"DS0004",
"37059-0004-Data.dta"))
# ses predictive data and implementation data
ses08_17 <-
read_dta(
here("data",
"ICPSR_38528",
"DS0002",
"38528-0002-Data.dta"))
# school counts data and implementation data
schoolct <-
read_dta(
here("data",
"ICPSR_38569",
"DS0001",
"38569-0001-Data.dta"))
# street connectivity predictive data
streetcon_predictive <-
read_dta(
here("data",
"ICPSR_38580",
"DS0001",
"38580-0001-Data.dta"))
# street connectivity implementation data
streetcon_implementation <-
read_dta(
here("data",
"ICPSR_38580",
"DS0003",
"38580-0003-Data.dta"))
# pollution predictive data and implementation data
pollution <-
read_dta(
here("data",
"ICPSR_38597",
"DS0001",
"38597-0001-Data.dta"))
# land cover predictive data and implementation data
landcover <-
read_dta(
here("data",
"ICPSR_38598",
"DS0001",
"38598-0001-Data.dta"))We use annual measure of crime.
crime_2010 <-
crime_predictive|>
rename_all(tolower)|>
rename(county = stcofips)|>
select(county, year, viol, property, cpopcrim)|>
filter(year == 2010)|>
# generate crime rate per 100,000 people and the log transform
mutate(viol_rate = (viol / cpopcrim) * 100000,
property_rate = (property / cpopcrim) * 100000)|>
mutate(log_viol = log(viol_rate),
log_property = log(property_rate))crime_2016 <-
crime_implementation|>
rename_all(tolower)|>
select(fips_st, fips_cty, cpopcrim, viol, property)|>
# concatenate fips_st and fips_cty to make county fips
mutate(
fips_st = ifelse(fips_st >= 1 & fips_st <= 9,
paste0("0", fips_st),
as.character(fips_st))
)|>
mutate(
fips_cty = str_pad(fips_cty,
width = 3,
pad = "0")
)|>
mutate(county = str_c(fips_st, fips_cty))|>
select(-fips_cty, -fips_st)|>
# generate crime rate per 100,000 people and the log transform
mutate(viol_rate = (viol / cpopcrim) * 100000,
property_rate = (property / cpopcrim) * 100000)|>
mutate(log_viol = log(viol_rate),
log_property = log(property_rate))|>
select(county, everything())# create a function for effective data cleaning
clean_data<-
function(data) {
cleaned_data <- data |>
rename_all(tolower)|>
mutate(
county = substr((tract_fips10), 1, 5))|>
select(-tract_fips10)
return(cleaned_data)
}SES predictive data of 2010 is from ACS five-year estimate from 2008 to 2012. SES implementation data of 2016 is from ACS five-years estimate from 2013 to 2016.
# ACS 2008 to 2012 for year 2010
ses_2010 <-
clean_data(ses08_17)|>
group_by(county)|>
summarise_all(sum, na.rm = TRUE)|>
pivot_longer(
cols = !county,
names_to = "indicator",
values_to = "value"
)|>
filter(str_sub(indicator, -2) == "12")|>
# pivot SES indicators back as variables
pivot_wider(
names_from = indicator,
values_from = value
)|>
# cpopcrim and totpop08_12 are all total population estimate.
# cpopcrim is from crime dataset, totpop08_12 is from ses dataset
# since we are predicting crime rate, it would be more appropriate to use
# the population variable in the crime dataset, thus we will use cpopcrim
# as population estimate
select(-totpop08_12)
# ACS 2013 to 2017 for year 2016
ses_2016 <-
clean_data(ses08_17)|>
group_by(county)|>
summarise_all(sum, na.rm = TRUE)|>
pivot_longer(
cols = !county,
names_to = "indicator",
values_to = "value"
)|>
filter(str_sub(indicator, -2) == "17")|>
# pivot SES indicators back as variables
pivot_wider(
names_from = indicator,
values_from = value
)|>
select(-totpop13_17)We use annual school counts.
# school counts at 2016
schoolct_2010 <-
clean_data(schoolct)|>
group_by(county,year)|>
summarise_all(sum, na.rm = TRUE)|>
filter(year == 2010)
schoolct_2016 <-
clean_data(schoolct)|>
group_by(county,year)|>
summarise_all(sum, na.rm = TRUE)|>
filter(year == 2016)We use street connectivity data at 2010 for predictive data, and street connectivity data at 2020 for implementation data.
# from 2010 school count data
streetcon_2010 <-
clean_data(streetcon_predictive)|>
group_by(county)|>
summarise_all(sum, na.rm = TRUE)
# use 2020 data to represent 2016 school counts
streetcon_2016 <-
streetcon_implementation|>
rename_all(tolower)|>
mutate(county = substr((tract_fips20), 1, 5))|>
select(-tract_fips20)|>
group_by(county)|>
summarise_all(sum, na.rm = TRUE)We use annual measure of pollution.
# pollution in 2010
pollution_2010 <-
clean_data(pollution)|>
group_by(county,year)|>
summarise_all(sum, na.rm = TRUE)|>
filter(year == 2010)
# pollution in 2016
pollution_2016 <-
clean_data(pollution)|>
group_by(county,year)|>
summarise_all(sum, na.rm = TRUE)|>
filter(year == 2016)We use annual interpolation of land cover.
# land cover at 2010
landcover_2010 <-
landcover|>
rename_all(tolower)|>
mutate(county = substr((tract_fips), 1, 5))|>
select(-tract_fips, -intp_flag)|>
group_by(county, year_intp)|>
summarise_all(sum, na.rm = TRUE) |>
filter(year_intp == 2010)
# land cover at 2016
landcover_2016 <- landcover|>
rename_all(tolower)|>
mutate(county = substr((tract_fips), 1, 5))|>
select(-tract_fips, -intp_flag)|>
group_by(county, year_intp)|>
summarise_all(sum, na.rm = TRUE) |>
filter(year_intp == 2016)# combine the data sets with the by "county" column
crime_all_2010 <-
full_join(
crime_2010,
full_join(
ses_2010,
full_join(
schoolct_2010,
full_join(
streetcon_2010,
full_join(
pollution_2010, landcover_2010, by = "county"
),
by = "county"
),
by = "county"
),
by = "county"
),
by = "county"
)
crime_all_2010<-crime_all_2010|>
select(-year.y, -year.x, -year_intp)# combine the data sets with the by "county" and "year" columns
crime_all_2016 <-
full_join(
crime_2016,
full_join(
ses_2016,
full_join(
schoolct_2016,
full_join(
streetcon_2016,
full_join(
pollution_2016, landcover_2010, by = "county"
),
by = "county"
),
by = "county"
),
by = "county"
),
by = "county"
)
crime_all_2016 <-
crime_all_2016|>
select(-year.y, -year.x, -year_intp)|>
select(county, everything())|>
mutate(year = 2016)We then removed STATA label and attributes in outcome variables.
# remove STATA format and attributes
attr(crime_all_2010$viol, "format.stata") <- NULL
attr(crime_all_2010$property, "format.stata") <- NULL
attr(crime_all_2010$cpopcrim, "format.stata") <- NULL
attr(crime_all_2010$viol_rate, "format.stata") <- NULL
attr(crime_all_2010$property_rate, "format.stata") <- NULL
attr(crime_all_2010$log_viol, "format.stata") <- NULL
attr(crime_all_2010$log_property, "format.stata") <- NULL
attr(crime_all_2010$viol, "label") <- NULL
attr(crime_all_2010$property, "label") <- NULL
attr(crime_all_2010$cpopcrim, "label") <- NULL
attr(crime_all_2010$viol_rate, "label") <- NULL
attr(crime_all_2010$property_rate, "label") <- NULL
attr(crime_all_2010$log_viol, "label") <- NULL
attr(crime_all_2010$log_property, "label") <- NULL
str(crime_all_2010)tibble [3,276 × 114] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:3276] "01001" "01003" "01005" "01007" ...
..- attr(*, "label")= chr "Five-digit FIPS county code"
..- attr(*, "format.stata")= chr "%9s"
$ year : num [1:3276] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
..- attr(*, "label")= chr "Year that the offenses occurred"
..- attr(*, "format.stata")= chr "%4.0f"
$ viol : num [1:3276] 143 385 25 59 24 25 66 511 107 84 ...
$ property : num [1:3276] 1521 4161 455 408 305 ...
$ cpopcrim : num [1:3276] 54571 182265 27178 22915 57322 ...
$ viol_rate : num [1:3276] 262 211.2 92 257.5 41.9 ...
$ property_rate : num [1:3276] 2787 2283 1674 1780 532 ...
$ log_viol : num [1:3276] 5.57 5.35 4.52 5.55 3.73 ...
$ log_property : num [1:3276] 7.93 7.73 7.42 7.48 6.28 ...
$ popden08_12 : num [1:3276] 9634 12432 1266 184 834 ...
$ phispanic08_12 : num [1:3276] 0.2796 1.2298 0.4183 0.0722 0.7467 ...
$ pnhwhite08_12 : num [1:3276] 8.92 26.03 4.41 3.13 7.97 ...
$ pnhblack08_12 : num [1:3276] 2.5108 2.876 3.9907 0.7032 0.0983 ...
$ pfborn08_12 : num [1:3276] 0.1699 1.046 0.2123 0.0524 0.4356 ...
$ ped1_08_12 : num [1:3276] 2.038 3.868 2.191 0.921 2.275 ...
$ ped2_08_12 : num [1:3276] 7.63 18.93 5.37 2.75 5.65 ...
$ ped3_08_12 : num [1:3276] 2.334 8.205 1.436 0.328 1.077 ...
$ pin1b_08_12 : num [1:3276] 0.761 2.018 1.458 0.361 0.751 ...
$ pin2b_08_12 : num [1:3276] 1.368 4.478 1.54 0.858 1.448 ...
$ pin3b_08_12 : num [1:3276] 2.26 6.33 1.96 1.04 1.85 ...
$ pin4b_08_12 : num [1:3276] 4.6 11.07 2.85 1.27 3.48 ...
$ pin5b_08_12 : num [1:3276] 3.006 7.102 1.189 0.465 1.466 ...
$ pincgt75k08_12 : num [1:3276] 4.914 11.743 2.036 0.948 2.64 ...
$ pnvmar08_12 : num [1:3276] 3.21 6.6 2.78 1.01 1.77 ...
$ p18yr_08_12 : num [1:3276] 3.057 6.79 2.054 0.901 2.189 ...
$ p18_2908_12 : num [1:3276] 1.765 4.127 1.409 0.601 1.233 ...
$ p30_3908_12 : num [1:3276] 1.467 3.475 1.242 0.505 1.164 ...
$ p40_4908_12 : num [1:3276] 1.88 4.303 1.14 0.656 1.268 ...
$ p50_6908_12 : num [1:3276] 2.83 8.6 2.23 0.94 2.27 ...
$ pge7008_12 : num [1:3276] 0.999 3.711 0.921 0.398 0.874 ...
$ punemp08_12 : num [1:3276] 0.672 1.563 0.554 0.228 0.524 ...
$ pprof08_12 : num [1:3276] 3.61 9.605 2.568 0.782 2.422 ...
$ ppov08_12 : num [1:3276] 1.508 4.143 2.287 0.635 1.353 ...
$ ppubas08_12 : num [1:3276] 1.47 2.634 2.031 0.529 1.136 ...
$ pfhfam08_12 : num [1:3276] 1.35 2.642 1.392 0.319 0.588 ...
$ pownoc08_12 : num [1:3276] 9.4 23.3 6.14 3.34 7.32 ...
$ disadvantage08_12 : num [1:3276] 1.502 2.772 2.051 0.483 0.74 ...
$ disadvantage2_08_12 : num [1:3276] 1.25 2.745 1.566 0.428 0.9 ...
$ affluence08_12 : num [1:3276] 3.619 9.851 2.013 0.686 2.046 ...
$ ethnicimmigrant08_12 : num [1:3276] 0.2247 1.1379 0.3153 0.0623 0.5912 ...
$ all_school : num [1:3276] 18 55 14 14 24 6 8 47 18 8 ...
$ elem : num [1:3276] 11 37 9 7 16 4 5 26 11 5 ...
$ middle : num [1:3276] 7 18 7 5 13 2 4 20 8 6 ...
$ high : num [1:3276] 9 17 7 6 12 3 5 22 8 6 ...
$ public_school : num [1:3276] 15 46 11 11 20 5 7 41 14 8 ...
$ public_trad : num [1:3276] 15 46 11 11 20 5 7 41 14 8 ...
$ public_charter : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ pub_elem : num [1:3276] 8 28 6 5 12 3 4 20 7 5 ...
$ pub_middle : num [1:3276] 6 11 4 3 10 1 3 15 4 6 ...
$ pub_high : num [1:3276] 8 10 4 3 10 2 4 17 4 6 ...
$ public_trad_elem : num [1:3276] 8 28 6 5 12 3 4 20 7 5 ...
$ public_trad_middle : num [1:3276] 6 11 4 3 10 1 3 15 4 6 ...
$ public_trad_high : num [1:3276] 8 10 4 3 10 2 4 17 4 6 ...
$ public_charter_elem : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_charter_middle: num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_charter_high : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet_elem : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet_middle : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet_high : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ private_school : num [1:3276] 3 9 3 3 4 1 1 6 4 0 ...
$ private_elem : num [1:3276] 3 9 3 2 4 1 1 6 4 0 ...
$ private_middle : num [1:3276] 1 7 3 2 3 1 1 5 4 0 ...
$ private_high : num [1:3276] 1 7 3 3 2 1 1 5 4 0 ...
$ n_streets : num [1:3276] 8109 36296 7293 8226 13360 ...
$ sum_strintlen : num [1:3276] 2276795 7904534 2524658 2251633 3173244 ...
$ n_nodes : num [1:3276] 7160 31321 6249 7343 12267 ...
$ n_realnodes : num [1:3276] 3116 15005 3199 3450 5231 ...
$ n_blocks : num [1:3276] 1887 8587 1820 1777 2750 ...
$ tract_area_sqmiles : num [1:3276] 604 1712 905 626 651 ...
$ linknoderatio : num [1:3276] 13.96 36.35 10.68 4.47 9.8 ...
$ intdensity : num [1:3276] 384.3 733.6 78.3 25.2 74.6 ...
$ strnetdensity : num [1:3276] 131227 251716 35572 14991 44563 ...
$ connoderatio : num [1:3276] 5.68 15.14 4.71 1.87 3.84 ...
$ blockdensity : num [1:3276] 190.7 428.2 49.7 13.4 38.2 ...
$ avgblocklength : num [1:3276] 2671 5966 2859 1071 2153 ...
$ medblocklength : num [1:3276] 1730 3921 1546 578 1372 ...
$ gamma : num [1:3276] 4.68 12.15 3.58 1.49 3.27 ...
$ alpha : num [1:3276] 1.001 2.701 0.855 0.236 0.407 ...
$ count_tri_facilities : num [1:3276] 4 10 5 3 3 0 3 17 4 2 ...
$ value_0 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ prop_value_0 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ value_11 : num [1:3276] 2.19e+07 2.20e+08 5.60e+07 9.77e+06 2.03e+07 ...
$ prop_value_11 : num [1:3276] 0.1213 2.2523 0.4686 0.0275 0.1046 ...
$ value_12 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ prop_value_12 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ value_21 : num [1:3276] 6.07e+07 2.10e+08 6.54e+07 6.62e+07 8.76e+07 ...
$ prop_value_21 : num [1:3276] 1.72 3.379 0.369 0.176 0.477 ...
$ value_22 : num [1:3276] 22333800 85405500 18084000 8225400 22709700 ...
$ prop_value_22 : num [1:3276] 1.3087 1.8475 0.196 0.0206 0.1229 ...
$ value_23 : num [1:3276] 6585900 31294200 5006400 2522700 5593500 ...
$ prop_value_23 : num [1:3276] 0.48646 0.80184 0.07451 0.00617 0.02949 ...
$ value_24 : num [1:3276] 1817400 8820900 1484100 525300 1422600 ...
$ prop_value_24 : num [1:3276] 0.14089 0.25293 0.03721 0.00129 0.0073 ...
$ value_31 : num [1:3276] 2483100 24774300 4538400 2162100 3594300 ...
$ prop_value_31 : num [1:3276] 0.03998 0.4632 0.01481 0.00498 0.01928 ...
$ value_41 : num [1:3276] 2.31e+08 4.88e+06 2.06e+08 3.57e+08 5.84e+08 ...
$ prop_value_41 : num [1:3276] 1.0781 0.0288 0.824 0.953 3.1266 ...
$ value_42 : num [1:3276] 2.95e+08 1.30e+09 7.33e+08 4.81e+08 1.55e+08 ...
[list output truncated]
- attr(*, "label")= chr "National Neighborhood Data Archive (NaNDA): Crimes by County, United States, 200"
# the suffix for SES indicator 2008-2012 and 2013-2017 is different, we remove the suffix so that the variable names in predictor and implementation data are the same
remove_string_2010 <-
function(varname){
sub("08_12","",varname)
}
crime_all_2010_cleaned <-
crime_all_2010|>
# eliminate missing values and negative infinity for outcome variables
filter(!is.na(log_viol) | !is.na(log_property)) |>
filter(!(is.infinite(log_viol) | is.infinite(log_property)))|>
rename_with(remove_string_2010, contains("08_12"))
# explore missing values
missing_values_2010 <-
colSums(is.na(crime_all_2010_cleaned))
missing_values_2010 <-
as.data.frame(missing_values_2010)
# look at the variables with the most missing values,
# no variable has over 50% missing values
missing_values_percent_2010<-missing_values_2010|>
mutate(missing_percent = missing_values_2010/nrow(crime_all_2010_cleaned))|>
arrange(desc(missing_percent))|>
top_n(10)Repeat the above data cleaning process for implementation data in 2016.
attr(crime_all_2016$viol, "format.stata") <- NULL
attr(crime_all_2016$property, "format.stata") <- NULL
attr(crime_all_2016$cpopcrim, "format.stata") <- NULL
attr(crime_all_2016$viol_rate, "format.stata") <- NULL
attr(crime_all_2016$property_rate, "format.stata") <- NULL
attr(crime_all_2016$log_viol, "format.stata") <- NULL
attr(crime_all_2016$log_property, "format.stata") <- NULL
attr(crime_all_2016$viol, "label") <- NULL
attr(crime_all_2016$property, "label") <- NULL
attr(crime_all_2016$cpopcrim, "label") <- NULL
attr(crime_all_2016$viol_rate, "label") <- NULL
attr(crime_all_2016$property_rate, "label") <- NULL
attr(crime_all_2016$log_viol, "label") <- NULL
attr(crime_all_2016$log_property, "label") <- NULL
str(crime_all_2016)tibble [3,276 × 114] (S3: tbl_df/tbl/data.frame)
$ county : chr [1:3276] "01001" "01003" "01005" "01007" ...
$ cpopcrim : num [1:3276] 54499 207584 25778 22474 57565 ...
$ viol : num [1:3276] 153 443 114 16 433 ...
$ property : num [1:3276] 1722 4057 664 206 1228 ...
$ viol_rate : num [1:3276] 280.7 213.4 442.2 71.2 752.2 ...
$ property_rate : num [1:3276] 3160 1954 2576 917 2133 ...
$ log_viol : num [1:3276] 5.64 5.36 6.09 4.27 6.62 ...
$ log_property : num [1:3276] 8.06 7.58 7.85 6.82 7.67 ...
$ popden13_17 : num [1:3276] 9675 13924 1199 181 838 ...
$ phispanic13_17 : num [1:3276] 0.3698 1.2415 0.3662 0.0892 0.8895 ...
$ pnhwhite13_17 : num [1:3276] 8.68 25.81 4.14 3.18 7.8 ...
$ pnhblack13_17 : num [1:3276] 2.593 3.105 4.306 0.701 0.12 ...
$ pfborn13_17 : num [1:3276] 0.2289 0.8958 0.2211 0.0443 0.4392 ...
$ ped1_13_17 : num [1:3276] 1.697 3.31 2.342 0.629 1.894 ...
$ ped2_13_17 : num [1:3276] 7.54 18.79 5.48 2.77 5.94 ...
$ ped3_13_17 : num [1:3276] 2.768 8.895 1.178 0.602 1.169 ...
$ pin1b_13_17 : num [1:3276] 0.999 2.132 1.437 0.314 0.67 ...
$ pin2b_13_17 : num [1:3276] 1.439 3.141 1.739 0.533 1.307 ...
$ pin3b_13_17 : num [1:3276] 2.114 5.733 2.196 0.983 1.97 ...
$ pin4b_13_17 : num [1:3276] 4.26 11.41 2.45 1.43 3.49 ...
$ pin5b_13_17 : num [1:3276] 3.189 8.584 1.181 0.741 1.562 ...
$ pincgt75k13_17 : num [1:3276] 5.04 13.3 2.32 1.28 2.72 ...
$ pnvmar13_17 : num [1:3276] 3.16 7.67 3.06 1.12 1.84 ...
$ p18yr_13_17 : num [1:3276] 2.872 6.629 1.986 0.882 2.156 ...
$ p18_2913_17 : num [1:3276] 1.802 3.932 1.436 0.628 1.232 ...
$ p30_3913_17 : num [1:3276] 1.557 3.43 1.119 0.475 1.018 ...
$ p40_4913_17 : num [1:3276] 1.637 3.864 1.06 0.574 1.263 ...
$ p50_6913_17 : num [1:3276] 2.91 9 2.27 1.03 2.32 ...
$ pge7013_17 : num [1:3276] 1.218 4.148 1.125 0.414 1.016 ...
$ punemp13_17 : num [1:3276] 0.429 1.075 0.537 0.181 0.207 ...
$ pprof13_17 : num [1:3276] 3.9 10.37 2.24 1.02 2.5 ...
$ ppov13_17 : num [1:3276] 1.746 3.991 2.498 0.558 1.479 ...
$ ppubas13_17 : num [1:3276] 1.816 2.809 2.377 0.526 0.994 ...
$ pfhfam13_17 : num [1:3276] 1.234 2.044 1.555 0.317 0.634 ...
$ pownoc13_17 : num [1:3276] 8.9 22.84 5.63 3.09 7.1 ...
$ disadvantage13_17 : num [1:3276] 1.564 2.605 2.255 0.456 0.687 ...
$ disadvantage2_13_17 : num [1:3276] 1.306 2.48 1.742 0.395 0.828 ...
$ affluence13_17 : num [1:3276] 3.901 10.854 1.913 0.969 2.128 ...
$ ethnicimmigrant13_17 : num [1:3276] 0.2994 1.0686 0.2937 0.0667 0.6644 ...
$ all_school : num [1:3276] 18 52 10 11 22 5 8 42 17 8 ...
$ elem : num [1:3276] 10 35 5 6 13 2 5 21 8 5 ...
$ middle : num [1:3276] 6 17 5 3 12 2 4 17 7 6 ...
$ high : num [1:3276] 8 16 4 4 13 3 5 18 7 6 ...
$ public_school : num [1:3276] 16 45 9 10 20 4 7 39 14 8 ...
$ public_trad : num [1:3276] 16 45 9 10 20 4 6 39 14 8 ...
$ public_charter : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
$ pub_elem : num [1:3276] 8 28 4 5 11 1 4 18 7 5 ...
$ pub_middle : num [1:3276] 6 10 4 2 10 1 3 14 4 6 ...
$ pub_high : num [1:3276] 8 9 3 3 11 2 4 15 4 6 ...
$ public_trad_elem : num [1:3276] 8 28 4 5 11 1 3 18 7 5 ...
$ public_trad_middle : num [1:3276] 6 10 4 2 10 1 2 14 4 6 ...
$ public_trad_high : num [1:3276] 8 9 3 3 11 2 3 15 4 6 ...
$ public_charter_elem : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_charter_middle: num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_charter_high : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ public_magnet_elem : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
$ public_magnet_middle : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
$ public_magnet_high : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
$ private_school : num [1:3276] 2 7 1 1 2 1 1 3 3 0 ...
$ private_elem : num [1:3276] 2 7 1 1 2 1 1 3 1 0 ...
$ private_middle : num [1:3276] 0 7 1 1 2 1 1 3 3 0 ...
$ private_high : num [1:3276] 0 7 1 1 2 1 1 3 3 0 ...
$ n_streets : num [1:3276] 7077 33577 6519 5738 12773 ...
$ sum_strintlen : num [1:3276] 2113191 7669943 2430786 1876016 3165251 ...
$ n_nodes : num [1:3276] 6159 29072 5528 5129 11740 ...
$ n_realnodes : num [1:3276] 2720 13556 2889 2009 4736 ...
$ n_blocks : num [1:3276] 1512 6317 1283 1090 2404 ...
$ tract_area_sqmiles : num [1:3276] 604 1712 905 626 651 ...
$ linknoderatio : num [1:3276] 19.78 50.21 10.77 8.94 17.45 ...
$ intdensity : num [1:3276] 525.2 995.9 76.6 44.4 136.1 ...
$ strnetdensity : num [1:3276] 187018 366850 35389 30241 82999 ...
$ connoderatio : num [1:3276] 8.07 20.43 4.82 3.13 6.52 ...
$ blockdensity : num [1:3276] 227.3 458.1 35.6 23.8 71 ...
$ avgblocklength : num [1:3276] 4179 8417 3076 2589 3965 ...
$ medblocklength : num [1:3276] 2660 5603 1665 1455 2497 ...
$ gamma : num [1:3276] 6.64 16.8 3.61 2.99 5.84 ...
$ alpha : num [1:3276] 1.437 3.663 0.902 0.477 0.742 ...
$ count_tri_facilities : num [1:3276] 5 11 6 3 2 1 1 22 5 2 ...
$ value_0 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ prop_value_0 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ value_11 : num [1:3276] 2.19e+07 2.20e+08 5.60e+07 9.77e+06 2.03e+07 ...
$ prop_value_11 : num [1:3276] 0.1213 2.2523 0.4686 0.0275 0.1046 ...
$ value_12 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ prop_value_12 : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
$ value_21 : num [1:3276] 6.07e+07 2.10e+08 6.54e+07 6.62e+07 8.76e+07 ...
$ prop_value_21 : num [1:3276] 1.72 3.379 0.369 0.176 0.477 ...
$ value_22 : num [1:3276] 22333800 85405500 18084000 8225400 22709700 ...
$ prop_value_22 : num [1:3276] 1.3087 1.8475 0.196 0.0206 0.1229 ...
$ value_23 : num [1:3276] 6585900 31294200 5006400 2522700 5593500 ...
$ prop_value_23 : num [1:3276] 0.48646 0.80184 0.07451 0.00617 0.02949 ...
$ value_24 : num [1:3276] 1817400 8820900 1484100 525300 1422600 ...
$ prop_value_24 : num [1:3276] 0.14089 0.25293 0.03721 0.00129 0.0073 ...
$ value_31 : num [1:3276] 2483100 24774300 4538400 2162100 3594300 ...
$ prop_value_31 : num [1:3276] 0.03998 0.4632 0.01481 0.00498 0.01928 ...
$ value_41 : num [1:3276] 2.31e+08 4.88e+06 2.06e+08 3.57e+08 5.84e+08 ...
$ prop_value_41 : num [1:3276] 1.0781 0.0288 0.824 0.953 3.1266 ...
$ value_42 : num [1:3276] 2.95e+08 1.30e+09 7.33e+08 4.81e+08 1.55e+08 ...
$ prop_value_42 : num [1:3276] 1.244 6.838 2.748 1.207 0.862 ...
[list output truncated]
- attr(*, "label")= chr "Uniform Crime Reporting Program Data: County-Level Detailed Arrest and Offense D"
remove_string_2016 <-
function(varname){
sub("13_17","",varname)
}
crime_all_2016_cleaned <-
crime_all_2016|>
# eliminate missing values for outcome variables
filter(!is.na(log_viol) | !is.na(log_property)) |>
filter(!(is.infinite(log_viol) | is.infinite(log_property)))|>
rename_with(remove_string_2016, contains("13_17"))
# explore missing values
missing_values_2016 <-
colSums(is.na(crime_all_2016_cleaned))
missing_values_2016 <-
as.data.frame(missing_values_2016)
# look at the variables with the most missing values,
# no variable has over 50% missing values
missing_values_percent_2016 <-
missing_values_2016|>
mutate(missing_percent = missing_values_2016/nrow(crime_all_2010_cleaned))|>
arrange(desc(missing_percent))|>
top_n(10)Before starting the machine learning process, we used package plotly to map the average violent and property crime rates across the U.S. from 2002 to 2014, displayed on a state-by-state basis for better understanding.
We generate crime rate as the number of crimes per 100,000 people.
\[ Rate = \left( \frac{Crime}{Population} \right)10 ^ 5 \tag{1}\]
crime_02_14 <-
crime_predictive|>
rename_all(tolower)|>
rename(county=stcofips)|>
mutate(viol = murder+rape+robbery+agasslt)|>
mutate(property = burglry+larceny+mvtheft)|>
select(county, year, viol, property, cpopcrim) |>
mutate(viol_rate = (viol / cpopcrim) * 100000) |>
mutate(property_rate = (property / cpopcrim) * 100000) |>
filter(!is.na(viol_rate) | !is.na(property_rate)) |>
filter(!(is.infinite(viol_rate) | is.infinite(property_rate)))We then converted county units to states based on the five-digit FIPS code to more visually compare differences between regions.
crime_02_14$state_fips <-
substr(crime_02_14$county,
1, 2)
state_fips_to_abb <-
tibble(
state_fips = c("01", "02", "04", "05", "06",
"08", "09", "10", "11", "12",
"13", "15", "16", "17", "18",
"19", "20", "21", "22", "23",
"24", "25", "26", "27", "28",
"29", "30", "31", "32", "33",
"34", "35", "36", "37", "38",
"39", "40", "41", "42", "44",
"45", "46", "47", "48", "49",
"50", "51", "53", "54", "55", "56"),
state_abb = c("AL", "AK", "AZ", "AR", "CA", "CO",
"CT", "DE", "DC", "FL", "GA", "HI",
"ID", "IL", "IN", "IA", "KS", "KY",
"LA", "ME", "MD", "MA", "MI", "MN",
"MS", "MO", "MT", "NE", "NV", "NH",
"NJ", "NM", "NY", "NC", "ND", "OH",
"OK", "OR", "PA", "RI", "SC", "SD",
"TN", "TX", "UT", "VT", "VA", "WA",
"WV", "WI", "WY")
)Finally, we chose the choropleth map to illustrate the average violent crime and property crime rate in the U.S., 2002 - 2014. To keep the visualization informative, we used plotly package to add hover text and so on for interactivity. There are also other interactive settings for readers, like zoom in and out for details.
# convert county-level data to state-level data, aggregate violent crime data
crime_data_map <- crime_02_14 |>
group_by(state_fips) |>
summarise(
avg_viol_rate = mean(viol_rate, na.rm = TRUE),
avg_property_rate = mean(property_rate, na.rm = TRUE)
) |>
left_join(state_fips_to_abb,
by = 'state_fips') |>
mutate(
hover_violence = paste(state_abb,
'<br>',
"Violent Crimes Rate: ",
round(avg_viol_rate, 2)),
hover_property = paste(state_abb,
'<br>',
"Property Crimes Rate: ",
round(avg_property_rate, 2))
) |>
pivot_longer(
cols = c(avg_viol_rate,
avg_property_rate),
names_to = "crime_type",
values_to = "crime_count"
)# create and output the map for violent crime
violence_map <-
plot_geo(crime_data_map |>
filter(crime_type == "avg_viol_rate"),
locationmode = 'USA-states') |>
add_trace(
z = ~crime_count,
text = ~hover_violence,
hoverinfo = 'text',
locations = ~state_abb,
color = ~crime_count,
zmin = 0,
zmax = 1000,
colors = c("#1a9641", "#ffffbf", "#fdae61", "#d7191c"),
colorbar = list(title = "Violent Crimes Rate",
tickvals = c(250, 500, 750))
) |>
layout(
title = 'Average US Crimes Rate by State, 2002 - 2014',
geo = list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
)# create and output the map for property crime rate
property_map <-
plot_geo(crime_data_map |>
filter(crime_type == "avg_property_rate"),
locationmode = 'USA-states') |>
add_trace(
z = ~crime_count,
text = ~hover_property,
hoverinfo = 'text',
locations = ~state_abb,
color = ~crime_count,
zmin = 0,
zmax = 5000,
colors = c("#2b83ba", "#ffffbf", "#fdae61", "#d7191c"),
colorbar = list(title = "Property Crimes Rate",
tickvals = c(1000, 2000, 3000, 4000))
) |>
layout(
title = 'Avergae US Crimes Rate by State, 2002 - 2014',
geo = list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
)fig <-
plotly::subplot(violence_map,
property_map,
nrows = 2,
margin = 0.05)
# print the figure
figAreas with high crime rates are indicated by a darker red color. It can be seen that crime rate is higher in the south and near the border. There are also some interesting findings, such as the fact that Texas, which is usually considered to have a high rate of gun ownership, boasts a violent crime rate of less than 300, which could be influenced by population.
set.seed(904)
crime_split <-
initial_split(crime_all_2010_cleaned, prop = 0.8)
crime_train <-
training(crime_split)
crime_test <-
testing(crime_split)# set up 5-fold cross validation
set.seed(904)
crime_folds <-
vfold_cv(data = crime_train, v = 5)# create a recipe
ridge_v_rec <-
recipe(log_viol ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_property,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())|>
step_normalize(all_predictors())
# check if feature engineering works to remove variables
ridge_v_summary <-
summary(ridge_v_rec)
# model
ridge_v_mod <-
linear_reg(penalty = tune(), mixture = 0) |>
set_mode(mode = "regression") |>
set_engine(engine = "glmnet")
ridge_v_grid <-
grid_regular(penalty(),
levels = 20)
# workflow
ridge_v_wf <-
workflow() |>
add_recipe(recipe = ridge_v_rec) |>
add_model(spec = ridge_v_mod)ridge_v_resamples <-
ridge_v_wf |>
tune_grid(
resamples = crime_folds,
grid = ridge_v_grid,
metrics = metric_set(rmse)
)
ridge_v_resamples |>
collect_metrics()# A tibble: 20 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 e-10 rmse standard 0.776 5 0.0150 Preprocessor1_Model01
2 3.36e-10 rmse standard 0.776 5 0.0150 Preprocessor1_Model02
3 1.13e- 9 rmse standard 0.776 5 0.0150 Preprocessor1_Model03
4 3.79e- 9 rmse standard 0.776 5 0.0150 Preprocessor1_Model04
5 1.27e- 8 rmse standard 0.776 5 0.0150 Preprocessor1_Model05
6 4.28e- 8 rmse standard 0.776 5 0.0150 Preprocessor1_Model06
7 1.44e- 7 rmse standard 0.776 5 0.0150 Preprocessor1_Model07
8 4.83e- 7 rmse standard 0.776 5 0.0150 Preprocessor1_Model08
9 1.62e- 6 rmse standard 0.776 5 0.0150 Preprocessor1_Model09
10 5.46e- 6 rmse standard 0.776 5 0.0150 Preprocessor1_Model10
11 1.83e- 5 rmse standard 0.776 5 0.0150 Preprocessor1_Model11
12 6.16e- 5 rmse standard 0.776 5 0.0150 Preprocessor1_Model12
13 2.07e- 4 rmse standard 0.776 5 0.0150 Preprocessor1_Model13
14 6.95e- 4 rmse standard 0.776 5 0.0150 Preprocessor1_Model14
15 2.34e- 3 rmse standard 0.776 5 0.0150 Preprocessor1_Model15
16 7.85e- 3 rmse standard 0.776 5 0.0150 Preprocessor1_Model16
17 2.64e- 2 rmse standard 0.775 5 0.0148 Preprocessor1_Model17
18 8.86e- 2 rmse standard 0.768 5 0.0152 Preprocessor1_Model18
19 2.98e- 1 rmse standard 0.767 5 0.0164 Preprocessor1_Model19
20 1 e+ 0 rmse standard 0.776 5 0.0172 Preprocessor1_Model20
top_ridge_v_models <-
ridge_v_resamples |>
show_best() |>
arrange(penalty)
ridge_v_resamples|>
select_best()# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.298 Preprocessor1_Model19
ridge_v_resamples|>
autoplot()set.seed(904)
ridge_v_pre_wf <-
ridge_v_wf |>
finalize_workflow(select_best(ridge_v_resamples))
ridge_v_pre_wf <-
fit(ridge_v_pre_wf,
data = crime_train)
ridge_v_predictions <-
predict(object = ridge_v_pre_wf,
new_data = crime_test)$.pred
actual <-
crime_test$log_viol
county <-
crime_test$county
ridge_v_predictions_df <-
tibble(county,actual,ridge_v_predictions)
rmse_ridge_v<-rmse(ridge_v_predictions_df, actual, ridge_v_predictions)lasso_v_rec <-
recipe(log_viol ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_property,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())|>
step_normalize(all_predictors())
# check if feature engineering works to remove variables
lasso_v_summary <-
summary(lasso_v_rec)
# create a model
lasso_v_mod <-
linear_reg(penalty = tune(), mixture = 1) |>
set_mode(mode = "regression") |>
set_engine(engine = "glmnet")
lasso_v_grid <-
grid_regular(penalty(),
levels = 20)
lasso_v_wf <-
workflow() |>
add_recipe(recipe = lasso_v_rec) |>
add_model(spec = lasso_v_mod)set.seed(904)
lasso_v_resamples <-
lasso_v_wf|>
tune_grid(
resamples = crime_folds,
grid = lasso_v_grid,
metrics = metric_set(rmse)
)
lasso_v_resamples |>
collect_metrics()# A tibble: 20 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 e-10 rmse standard 0.861 5 0.0470 Preprocessor1_Model01
2 3.36e-10 rmse standard 0.861 5 0.0470 Preprocessor1_Model02
3 1.13e- 9 rmse standard 0.861 5 0.0470 Preprocessor1_Model03
4 3.79e- 9 rmse standard 0.861 5 0.0470 Preprocessor1_Model04
5 1.27e- 8 rmse standard 0.861 5 0.0470 Preprocessor1_Model05
6 4.28e- 8 rmse standard 0.861 5 0.0470 Preprocessor1_Model06
7 1.44e- 7 rmse standard 0.861 5 0.0470 Preprocessor1_Model07
8 4.83e- 7 rmse standard 0.861 5 0.0470 Preprocessor1_Model08
9 1.62e- 6 rmse standard 0.861 5 0.0470 Preprocessor1_Model09
10 5.46e- 6 rmse standard 0.861 5 0.0470 Preprocessor1_Model10
11 1.83e- 5 rmse standard 0.861 5 0.0470 Preprocessor1_Model11
12 6.16e- 5 rmse standard 0.858 5 0.0471 Preprocessor1_Model12
13 2.07e- 4 rmse standard 0.829 5 0.0306 Preprocessor1_Model13
14 6.95e- 4 rmse standard 0.790 5 0.0190 Preprocessor1_Model14
15 2.34e- 3 rmse standard 0.772 5 0.0139 Preprocessor1_Model15
16 7.85e- 3 rmse standard 0.766 5 0.0159 Preprocessor1_Model16
17 2.64e- 2 rmse standard 0.776 5 0.0159 Preprocessor1_Model17
18 8.86e- 2 rmse standard 0.793 5 0.0166 Preprocessor1_Model18
19 2.98e- 1 rmse standard 0.827 5 0.0166 Preprocessor1_Model19
20 1 e+ 0 rmse standard 0.827 5 0.0166 Preprocessor1_Model20
top_lasso_v_models <-
lasso_v_resamples |>
show_best() |>
arrange(penalty)
lasso_v_resamples|>
select_best()# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00785 Preprocessor1_Model16
lasso_v_resamples|>
autoplot()set.seed(904)
lasso_v_pre_wf <-
lasso_v_wf |>
finalize_workflow(select_best(lasso_v_resamples))
lasso_v_pre_wf <-
fit(lasso_v_pre_wf,
data = crime_train)
lasso_v_predictions <-
predict(object = lasso_v_pre_wf,
new_data = crime_test)$.pred
actual <- crime_test$log_viol
county <- crime_test$county
lasso_v_predictions_df <-
tibble(county,actual,lasso_v_predictions)
rmse_lasso_v <-
rmse(lasso_v_predictions_df, actual, lasso_v_predictions)rf_v_rec <-
recipe(log_viol ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_property,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
rf_v_mod <-
rand_forest(
mtry = tune(),
min_n = tune(),
trees = 200
) |>
set_mode(mode = "regression") |>
set_engine(
engine = "ranger",
importance = "impurity",
num.threads = 4
)
rf_v_wf <-
workflow() |>
add_recipe(rf_v_rec) |>
add_model(rf_v_mod)
rf_v_grid <- grid_regular(
mtry(range = c(1, 15)),
min_n(range = c(1, 15)),
levels = 5
)set.seed(904)
rf_v_resamples <-
tune_grid(
rf_v_wf,
resamples = crime_folds,
grid = rf_v_grid)
collect_metrics(rf_v_resamples)# A tibble: 50 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 1 rmse standard 0.680 5 0.0144 Preprocessor1_Model01
2 1 1 rsq standard 0.341 5 0.0198 Preprocessor1_Model01
3 4 1 rmse standard 0.654 5 0.0133 Preprocessor1_Model02
4 4 1 rsq standard 0.383 5 0.0184 Preprocessor1_Model02
5 8 1 rmse standard 0.647 5 0.0131 Preprocessor1_Model03
6 8 1 rsq standard 0.393 5 0.0199 Preprocessor1_Model03
7 11 1 rmse standard 0.643 5 0.0132 Preprocessor1_Model04
8 11 1 rsq standard 0.401 5 0.0194 Preprocessor1_Model04
9 15 1 rmse standard 0.643 5 0.0125 Preprocessor1_Model05
10 15 1 rsq standard 0.399 5 0.0189 Preprocessor1_Model05
# ℹ 40 more rows
show_best(rf_v_resamples,
metric = "rmse") # A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 15 8 rmse standard 0.643 5 0.0131 Preprocessor1_Model15
2 11 1 rmse standard 0.643 5 0.0132 Preprocessor1_Model04
3 15 1 rmse standard 0.643 5 0.0125 Preprocessor1_Model05
4 15 11 rmse standard 0.644 5 0.0131 Preprocessor1_Model20
5 11 4 rmse standard 0.645 5 0.0139 Preprocessor1_Model09
select_best(rf_v_resamples)# A tibble: 1 × 3
mtry min_n .config
<int> <int> <chr>
1 15 8 Preprocessor1_Model15
autoplot(rf_v_resamples)set.seed(904)
rf_v_pre_wf <-
rf_v_wf |>
finalize_workflow(select_best(rf_v_resamples))
rf_v_pre_wf<-
fit(rf_v_pre_wf,
data = crime_train)
rf_v_predictions <-
predict(object = rf_v_pre_wf,
new_data = crime_test)$.pred
actual <-
crime_test$log_viol
county <-
crime_test$county
rf_v_predictions_df<-
tibble(county,actual,rf_v_predictions)
rmse_rf_v <-
rmse(rf_v_predictions_df, actual, rf_v_predictions)set.seed(904)
rf_v_final_wf <-
rf_v_wf |>
finalize_workflow(select_best(rf_v_resamples))
rf_v_fit <-
rf_v_final_wf |>
fit(data = crime_all_2010_cleaned)
crime_labels <- c(
pnhblack = "Proportion of people non-Hispanic Black",
pin1b_ = "Proportion of families with Income less than 15K",
disadvantage = "Disadvantage1",
ppubas = "Prop. of households with public assistance income",
ppov = "Prop. ppl w/ income past 12 months below poverty level",
value_52 = "Road covered by Shrub/Scrub (sqm)",
pfhfam = "Proportion female-headed families w/ kids",
disadvantage2_ = "Disadvantage2",
prop_value_41 = "Prop. road covered by deciduous Forest (sqm)",
pnhwhite = "Proportion of people non-Hispanic White",
ped1_ = "Prop. w/ less than high school diploma",
prop_value_52 = "Prop. road covered by Shrub/Scrub (sqm)",
phispanic = "Proportion of people of Hispanic origin",
pin2b_ = "Proportion of families with Income 15-30K",
pnwhite = "Proportion of people non-Hispanic White",
value_41 = "Road covered by deciduous Forest (sqm)",
punemp = "Prop. 16+ civ labor force unemployed",
prop_value_21 = "Prop. Developed, Open Space (sqm)",
value_24 = "Road Developed, High Intensity (sqm)",
prop_value_24 = "Prop road Developed, High Intensity (sqm)",
prop_value_23 = "Prop. Road Developed, Medium Intensity (sqm)",
prop_value_22 = "Prop. Road Developed, Low Intensity (sqm)",
p18_29 = "Proportion of population 18-29 years of age",
popden = "Persons per square mile"
)
rf_v_fit|>
extract_fit_parsnip()|>
vip(num_features = 15) %>%
.$data |>
mutate(
Importance = Importance / max(Importance),
Variable = fct_reorder(.x = Importance, .f = Variable)
) |>
ggplot(aes(Importance, Variable)) +
scale_y_discrete(labels = crime_labels) +
labs(title = "Top 15 Variables for Violent Crime Prediction") +
geom_col()Based on the result from random forest model, the top 15 important variables for predicting violent crime are above. Most of the variables are from the socioeconomic data and few are from the land cover data.
Disadvantage1 is the mean of proportion of people Non-Hispanic Black, proportion female-headed families with kids, proportion of households with public assistance income, proportion people with income past 12 months below poverty level, and proportion 16+ civilian labor force unemployed.
Disadvantage2 is the mean of proportion female-headed families with kids, proportion of households with public assistance income, proportion people with income past 12 months below poverty level, and proportion 16+ civilian labor force unemployed.
# recipe
mars_v_rec <-
recipe(log_viol ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_property,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
mars_v_rec_sum <-
summary(mars_v_rec)
# model
mars_v_mod <-
mars(
num_terms = tune(),
prod_degree = tune()
) |>
set_mode(mode = "regression") |>
set_engine(engine = "earth")
# tuning grid
mars_v_grid <-
grid_regular(
num_terms(range = c(10, 100)),
prod_degree(),
levels = 10
)
# workflow
mars_v_wf <-
workflow() |>
add_recipe(recipe = mars_v_rec) |>
add_model(spec = mars_v_mod)# resample
set.seed(904)
mars_v_resamples <-
mars_v_wf|>
tune_grid(
resamples = crime_folds,
grid = mars_v_grid,
metrics = metric_set(rmse)
)
collect_metrics(mars_v_resamples)# A tibble: 20 × 8
num_terms prod_degree .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 10 1 rmse standard 0.679 5 0.0107 Preprocessor1_M…
2 20 1 rmse standard 0.668 5 0.0123 Preprocessor1_M…
3 30 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
4 40 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
5 50 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
6 60 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
7 70 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
8 80 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
9 90 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
10 100 1 rmse standard 0.668 5 0.0121 Preprocessor1_M…
11 10 2 rmse standard 0.674 5 0.00990 Preprocessor1_M…
12 20 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
13 30 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
14 40 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
15 50 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
16 60 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
17 70 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
18 80 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
19 90 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
20 100 2 rmse standard 0.663 5 0.00751 Preprocessor1_M…
top_mars_v_models <-
mars_v_resamples |>
show_best()
mars_v_resamples|>
select_best()# A tibble: 1 × 3
num_terms prod_degree .config
<int> <int> <chr>
1 20 2 Preprocessor1_Model12
set.seed(904)
mars_v_pre_wf <-
mars_v_wf |>
finalize_workflow(select_best(mars_v_resamples))
mars_v_pre_wf<-
fit(mars_v_pre_wf,
data = crime_train)
mars_v_predictions <-
predict(object = mars_v_pre_wf,
new_data = crime_test)$.pred
actual<-crime_test$log_viol
county<-crime_test$county
mars_v_predictions_df<-
tibble(county,actual,mars_v_predictions)
rmse_mars_v<-rmse(mars_v_predictions_df, actual, mars_v_predictions)xgb_v_rec <-
recipe(log_viol ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_property,
new_role = "id")|>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
all_cores <-
parallel::detectCores(logical = FALSE)
registerDoFuture()
cl <-
makeCluster(all_cores - 3L)
plan(cluster, workers = cl)
xgb_v_spec <-
boost_tree(
trees = 200,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune()
) |>
set_engine("xgboost") |>
set_mode("regression")
set.seed(904)
xgb_v_grid <-
grid_latin_hypercube(
tree_depth(),
min_n(range = c(2, 10)),
loss_reduction(range = c(-5, -3)),
sample_size = sample_prop(),
mtry(range = c(1, 10)),
learn_rate(range = c(-5, -1)),
size = 30
)
xgb_v_wf <- workflow() |>
add_recipe(xgb_v_rec) |>
add_model(xgb_v_spec)set.seed(904)
xgb_v_resamples <-
tune_grid(
xgb_v_wf,
resamples = crime_folds,
grid = xgb_v_grid)
collect_metrics(xgb_v_resamples)# A tibble: 60 × 12
mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 4 8 2 0.000393 0.000173 0.675 rmse
2 4 8 2 0.000393 0.000173 0.675 rsq
3 7 2 4 0.000566 0.000970 0.829 rmse
4 7 2 4 0.000566 0.000970 0.829 rsq
5 4 7 9 0.0756 0.0000200 0.111 rmse
6 4 7 9 0.0756 0.0000200 0.111 rsq
7 7 3 14 0.00179 0.000214 0.413 rmse
8 7 3 14 0.00179 0.000214 0.413 rsq
9 5 8 6 0.0000187 0.0000519 0.195 rmse
10 5 8 6 0.0000187 0.0000519 0.195 rsq
# ℹ 50 more rows
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
show_best(xgb_v_resamples, metric = "rmse") # A tibble: 5 × 12
mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 6 7 6 0.0461 0.0000290 0.547 rmse
2 4 4 13 0.0262 0.0000333 0.922 rmse
3 2 6 5 0.0559 0.000137 0.998 rmse
4 2 9 8 0.0306 0.0000118 0.156 rmse
5 10 3 4 0.0167 0.0000631 0.631 rmse
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
select_best(xgb_v_resamples, metric= "rmse")# A tibble: 1 × 7
mtry min_n tree_depth learn_rate loss_reduction sample_size .config
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 6 7 6 0.0461 0.0000290 0.547 Preprocessor1_Mo…
set.seed(904)
xgb_v_pre_wf <-
xgb_v_wf |>
finalize_workflow(select_best(xgb_v_resamples))
xgb_v_pre_wf <-
fit(xgb_v_pre_wf,
data = crime_train)
xgb_v_predictions <-
predict(object = xgb_v_pre_wf,
new_data = crime_test)$.pred
actual <-
crime_test$log_viol
county <-
crime_test$county
xgb_v_predictions_df <-
tibble(county,actual,xgb_v_predictions)
rmse_xgb_v <-
rmse(xgb_v_predictions_df, actual, xgb_v_predictions)violence_comparison <-
bind_rows(rmse_ridge_v, rmse_lasso_v, rmse_rf_v, rmse_mars_v, rmse_xgb_v)
violence_comparison|>
mutate(
model = c("Ridge Regression","Lasso Regression",
"Random Forest", "MARS", "XG Boost"))|>
select(model, everything())# A tibble: 5 × 4
model .metric .estimator .estimate
<chr> <chr> <chr> <dbl>
1 Ridge Regression rmse standard 0.822
2 Lasso Regression rmse standard 0.817
3 Random Forest rmse standard 0.686
4 MARS rmse standard 0.712
5 XG Boost rmse standard 0.674
Because XG Boost has the lowest RMSE value, we will use XG boost to predict violent crime rate in 2016.
set.seed(904)
xgb_v_pre_wf_2016 <-
xgb_v_wf |>
finalize_workflow(select_best(xgb_v_resamples))
xgb_v_pre_wf_2016 <-
fit(xgb_v_pre_wf,
data = crime_all_2010_cleaned)
xgb_v_predictions_2016 <-
predict(object = xgb_v_pre_wf_2016,
new_data = crime_all_2016_cleaned)$.pred
actual_implement_v <- crime_all_2016_cleaned$log_viol
county_implement <- crime_all_2016_cleaned$county
xgb_v_predictions_2016_df <-
tibble(county_implement,actual_implement_v,xgb_v_predictions_2016)
rmse_xgb_v_2016 <-
rmse(xgb_v_predictions_2016_df, actual_implement_v, xgb_v_predictions_2016)
rmse_xgb_v_2016# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.593
Final prediction for violent crime rate in 2016 has a RMSE of 0.593.
# create a recipe
ridge_p_rec <-
recipe(log_property ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_viol,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())|>
step_normalize(all_predictors())
# check if feature engineering works to remove variables
ridge_p_summary <-
summary(ridge_p_rec)
# model
ridge_p_mod <-
linear_reg(penalty = tune(), mixture = 0) |>
set_mode(mode = "regression") |>
set_engine(engine = "glmnet")
ridge_p_grid<-
grid_regular(penalty(),
levels = 20)
# workflow
ridge_p_wf <-
workflow() |>
add_recipe(recipe = ridge_p_rec) |>
add_model(spec = ridge_p_mod)ridge_p_resamples <-
ridge_p_wf |>
tune_grid(
resamples = crime_folds,
grid = ridge_p_grid,
metrics = metric_set(rmse)
)
ridge_p_resamples |>
collect_metrics()# A tibble: 20 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 e-10 rmse standard 0.606 5 0.0174 Preprocessor1_Model01
2 3.36e-10 rmse standard 0.606 5 0.0174 Preprocessor1_Model02
3 1.13e- 9 rmse standard 0.606 5 0.0174 Preprocessor1_Model03
4 3.79e- 9 rmse standard 0.606 5 0.0174 Preprocessor1_Model04
5 1.27e- 8 rmse standard 0.606 5 0.0174 Preprocessor1_Model05
6 4.28e- 8 rmse standard 0.606 5 0.0174 Preprocessor1_Model06
7 1.44e- 7 rmse standard 0.606 5 0.0174 Preprocessor1_Model07
8 4.83e- 7 rmse standard 0.606 5 0.0174 Preprocessor1_Model08
9 1.62e- 6 rmse standard 0.606 5 0.0174 Preprocessor1_Model09
10 5.46e- 6 rmse standard 0.606 5 0.0174 Preprocessor1_Model10
11 1.83e- 5 rmse standard 0.606 5 0.0174 Preprocessor1_Model11
12 6.16e- 5 rmse standard 0.606 5 0.0174 Preprocessor1_Model12
13 2.07e- 4 rmse standard 0.606 5 0.0174 Preprocessor1_Model13
14 6.95e- 4 rmse standard 0.606 5 0.0174 Preprocessor1_Model14
15 2.34e- 3 rmse standard 0.606 5 0.0174 Preprocessor1_Model15
16 7.85e- 3 rmse standard 0.606 5 0.0174 Preprocessor1_Model16
17 2.64e- 2 rmse standard 0.605 5 0.0178 Preprocessor1_Model17
18 8.86e- 2 rmse standard 0.602 5 0.0199 Preprocessor1_Model18
19 2.98e- 1 rmse standard 0.605 5 0.0213 Preprocessor1_Model19
20 1 e+ 0 rmse standard 0.613 5 0.0216 Preprocessor1_Model20
top_ridge_p_models <-
ridge_p_resamples |>
show_best() |>
arrange(penalty)
ridge_p_resamples|>
select_best()# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.0886 Preprocessor1_Model18
ridge_p_resamples|>
autoplot()set.seed(904)
ridge_p_pre_wf <-
ridge_p_wf |>
finalize_workflow(select_best(ridge_p_resamples))
ridge_p_pre_wf <-
fit(ridge_p_pre_wf,
data = crime_train)
ridge_p_predictions <-
predict(object = ridge_p_pre_wf,
new_data = crime_test)$.pred
actual_p <-
crime_test$log_property
county <-
crime_test$county
ridge_p_predictions_df <-
tibble(county,actual_p,ridge_p_predictions)
rmse_ridge_p <-
rmse(ridge_p_predictions_df, actual_p, ridge_p_predictions)lasso_p_rec <-
recipe(log_property ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_viol,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())|>
step_normalize(all_predictors())
# check if feature engineering works to remove variables
lasso_p_summary <-
summary(lasso_p_rec)
# create a model
lasso_p_mod <-
linear_reg(penalty = tune(), mixture = 1) |>
set_mode(mode = "regression") |>
set_engine(engine = "glmnet")
lasso_p_grid <-
grid_regular(penalty(),
levels = 20)
lasso_p_wf <-
workflow() |>
add_recipe(recipe = lasso_p_rec) |>
add_model(spec = lasso_p_mod)set.seed(904)
lasso_p_resamples <-
lasso_p_wf|>
tune_grid(
resamples = crime_folds,
grid = lasso_p_grid,
metrics = metric_set(rmse)
)
lasso_p_resamples |>
collect_metrics()# A tibble: 20 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 e-10 rmse standard 0.678 5 0.0402 Preprocessor1_Model01
2 3.36e-10 rmse standard 0.678 5 0.0402 Preprocessor1_Model02
3 1.13e- 9 rmse standard 0.678 5 0.0402 Preprocessor1_Model03
4 3.79e- 9 rmse standard 0.678 5 0.0402 Preprocessor1_Model04
5 1.27e- 8 rmse standard 0.678 5 0.0402 Preprocessor1_Model05
6 4.28e- 8 rmse standard 0.678 5 0.0402 Preprocessor1_Model06
7 1.44e- 7 rmse standard 0.678 5 0.0402 Preprocessor1_Model07
8 4.83e- 7 rmse standard 0.678 5 0.0402 Preprocessor1_Model08
9 1.62e- 6 rmse standard 0.678 5 0.0402 Preprocessor1_Model09
10 5.46e- 6 rmse standard 0.678 5 0.0402 Preprocessor1_Model10
11 1.83e- 5 rmse standard 0.678 5 0.0402 Preprocessor1_Model11
12 6.16e- 5 rmse standard 0.665 5 0.0325 Preprocessor1_Model12
13 2.07e- 4 rmse standard 0.637 5 0.0199 Preprocessor1_Model13
14 6.95e- 4 rmse standard 0.611 5 0.0174 Preprocessor1_Model14
15 2.34e- 3 rmse standard 0.607 5 0.0177 Preprocessor1_Model15
16 7.85e- 3 rmse standard 0.604 5 0.0204 Preprocessor1_Model16
17 2.64e- 2 rmse standard 0.611 5 0.0212 Preprocessor1_Model17
18 8.86e- 2 rmse standard 0.630 5 0.0216 Preprocessor1_Model18
19 2.98e- 1 rmse standard 0.656 5 0.0214 Preprocessor1_Model19
20 1 e+ 0 rmse standard 0.656 5 0.0214 Preprocessor1_Model20
top_lasso_p_models <-
lasso_p_resamples |>
show_best() |>
arrange(penalty)
lasso_p_resamples|>
select_best()# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00785 Preprocessor1_Model16
lasso_p_resamples|>
autoplot()set.seed(904)
lasso_p_pre_wf <-
lasso_p_wf |>
finalize_workflow(select_best(lasso_p_resamples))
lasso_p_pre_wf <-
fit(lasso_p_pre_wf,
data = crime_train)
lasso_p_predictions <-
predict(object = lasso_p_pre_wf,
new_data = crime_test)$.pred
actual_p <-
crime_test$log_property
county <-
crime_test$county
lasso_p_predictions_df <-
tibble(county,actual_p,lasso_p_predictions)
rmse_lasso_p <-
rmse(lasso_p_predictions_df, actual_p, lasso_p_predictions)rf_p_rec <-
recipe(log_property ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_viol,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
rf_p_mod <-
rand_forest(
mtry = tune(),
min_n = tune(),
trees = 200
) |>
set_mode(mode = "regression") |>
set_engine(
engine = "ranger",
importance = "impurity",
num.threads = 4
)
rf_p_wf <-
workflow() |>
add_recipe(rf_p_rec) |>
add_model(rf_p_mod)
rf_p_grid <-
grid_regular(
mtry(range = c(1, 15)),
min_n(range = c(1, 15)),
levels = 5
)set.seed(904)
rf_p_resamples <-
tune_grid(
rf_p_wf,
resamples = crime_folds,
grid = rf_p_grid)
collect_metrics(rf_p_resamples)# A tibble: 50 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 1 rmse standard 0.497 5 0.0200 Preprocessor1_Model01
2 1 1 rsq standard 0.431 5 0.0107 Preprocessor1_Model01
3 4 1 rmse standard 0.487 5 0.0196 Preprocessor1_Model02
4 4 1 rsq standard 0.452 5 0.0142 Preprocessor1_Model02
5 8 1 rmse standard 0.485 5 0.0190 Preprocessor1_Model03
6 8 1 rsq standard 0.456 5 0.0120 Preprocessor1_Model03
7 11 1 rmse standard 0.485 5 0.0196 Preprocessor1_Model04
8 11 1 rsq standard 0.456 5 0.0130 Preprocessor1_Model04
9 15 1 rmse standard 0.482 5 0.0187 Preprocessor1_Model05
10 15 1 rsq standard 0.463 5 0.0115 Preprocessor1_Model05
# ℹ 40 more rows
show_best(rf_p_resamples,
metric = "rmse") # A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 15 1 rmse standard 0.482 5 0.0187 Preprocessor1_Model05
2 15 4 rmse standard 0.483 5 0.0204 Preprocessor1_Model10
3 11 4 rmse standard 0.483 5 0.0190 Preprocessor1_Model09
4 11 11 rmse standard 0.484 5 0.0198 Preprocessor1_Model19
5 15 15 rmse standard 0.484 5 0.0188 Preprocessor1_Model25
select_best(rf_p_resamples)# A tibble: 1 × 3
mtry min_n .config
<int> <int> <chr>
1 15 1 Preprocessor1_Model05
autoplot(rf_p_resamples)set.seed(904)
rf_p_pre_wf <-
rf_p_wf |>
finalize_workflow(select_best(rf_p_resamples))
rf_p_pre_wf <-
fit(rf_p_pre_wf,
data = crime_train)
rf_p_predictions <-
predict(object = rf_p_pre_wf,
new_data = crime_test)$.pred
actual_p <-
crime_test$log_property
county <-
crime_test$county
rf_p_predictions_df<-
tibble(county,actual_p,rf_p_predictions)
rmse_rf_p <-
rmse(rf_p_predictions_df, actual_p, rf_p_predictions)rf_p_final_wf <-
rf_p_wf |>
finalize_workflow(select_best(rf_p_resamples))
rf_p_fit <-
rf_p_final_wf |>
fit(data = crime_all_2010_cleaned)
set.seed(904)
rf_p_fit|>
extract_fit_parsnip()|>
vip(num_features = 15) %>%
.$data |>
mutate(
Importance = Importance / max(Importance),
Variable = fct_reorder(.x = Importance, .f = Variable)
) |>
ggplot(aes(Importance, Variable)) +
scale_y_discrete(labels = crime_labels) +
labs(title = "Top 15 Variables for Property Crime Prediction") +
geom_col()Based on the result from random forest model, the top 15 important variables for predicting property crime are shown above. Most of the variables are from the socioeconomic data and few are from the land cover data.
# recipe
mars_p_rec <-
recipe(log_property ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_viol,
new_role = "id") |>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
mars_p_rec_sum <- summary(mars_p_rec)
# model
mars_p_mod <-
mars(
num_terms = tune(),
prod_degree = tune()
) |>
set_mode(mode = "regression") |>
set_engine(engine = "earth")
# tuning grid
mars_p_grid <-
grid_regular(
num_terms(range = c(10, 100)),
prod_degree(),
levels = 10
)
# workflow
mars_p_wf <-
workflow() |>
add_recipe(recipe = mars_p_rec) |>
add_model(spec = mars_p_mod)# resample
set.seed(904)
mars_p_resamples <-
mars_p_wf|>
tune_grid(
resamples = crime_folds,
grid = mars_p_grid,
metrics = metric_set(rmse)
)
collect_metrics(mars_p_resamples)# A tibble: 20 × 8
num_terms prod_degree .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 10 1 rmse standard 0.513 5 0.0202 Preprocessor1_M…
2 20 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
3 30 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
4 40 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
5 50 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
6 60 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
7 70 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
8 80 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
9 90 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
10 100 1 rmse standard 0.508 5 0.0201 Preprocessor1_M…
11 10 2 rmse standard 0.506 5 0.0173 Preprocessor1_M…
12 20 2 rmse standard 0.503 5 0.0161 Preprocessor1_M…
13 30 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
14 40 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
15 50 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
16 60 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
17 70 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
18 80 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
19 90 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
20 100 2 rmse standard 0.502 5 0.0166 Preprocessor1_M…
top_mars_p_models <-
mars_p_resamples |>
show_best()
mars_p_resamples|>
select_best()# A tibble: 1 × 3
num_terms prod_degree .config
<int> <int> <chr>
1 30 2 Preprocessor1_Model13
set.seed(904)
mars_p_pre_wf <-
mars_p_wf |>
finalize_workflow(select_best(mars_p_resamples))
mars_p_pre_wf <-
fit(mars_p_pre_wf,
data = crime_train)
mars_p_predictions <-
predict(object = mars_p_pre_wf,
new_data = crime_test)$.pred
actual_p <-
crime_test$log_property
county <-
crime_test$county
mars_p_predictions_df <-
tibble(county,actual_p,mars_p_predictions)
rmse_mars_p <-
rmse(mars_p_predictions_df, actual_p, mars_p_predictions)xgb_p_rec <-
recipe(log_property ~ .,
data = crime_train) |>
update_role(county,
year,
viol,
viol_rate,
property_rate,
property,
log_viol,
new_role = "id")|>
step_impute_median(all_predictors())|>
step_nzv(all_predictors())
all_cores <-
parallel::detectCores(logical = FALSE)
registerDoFuture()
cl <-
makeCluster(all_cores - 3L)
plan(cluster, workers = cl)
xgb_p_spec <-
boost_tree(
trees = 200,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune()
) |>
set_engine("xgboost") |>
set_mode("regression")
set.seed(904)
xgb_p_grid <-
grid_latin_hypercube(
tree_depth(),
min_n(range = c(2, 10)),
loss_reduction(range = c(-5, -3)),
sample_size = sample_prop(),
mtry(range = c(1, 10)),
learn_rate(range = c(-5, -1)),
size = 30
)
xgb_p_wf <- workflow() |>
add_recipe(xgb_p_rec) |>
add_model(xgb_p_spec)set.seed(904)
xgb_p_resamples <-
tune_grid(
xgb_p_wf,
resamples = crime_folds,
grid = xgb_p_grid)
collect_metrics(xgb_p_resamples)# A tibble: 60 × 12
mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 4 8 2 0.000393 0.000173 0.675 rmse
2 4 8 2 0.000393 0.000173 0.675 rsq
3 7 2 4 0.000566 0.000970 0.829 rmse
4 7 2 4 0.000566 0.000970 0.829 rsq
5 4 7 9 0.0756 0.0000200 0.111 rmse
6 4 7 9 0.0756 0.0000200 0.111 rsq
7 7 3 14 0.00179 0.000214 0.413 rmse
8 7 3 14 0.00179 0.000214 0.413 rsq
9 5 8 6 0.0000187 0.0000519 0.195 rmse
10 5 8 6 0.0000187 0.0000519 0.195 rsq
# ℹ 50 more rows
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
show_best(xgb_p_resamples, metric = "rmse") # A tibble: 5 × 12
mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 4 4 13 0.0262 0.0000333 0.922 rmse
2 6 7 6 0.0461 0.0000290 0.547 rmse
3 2 6 5 0.0559 0.000137 0.998 rmse
4 2 9 8 0.0306 0.0000118 0.156 rmse
5 4 7 9 0.0756 0.0000200 0.111 rmse
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
# .config <chr>
select_best(xgb_p_resamples, metric= "rmse")# A tibble: 1 × 7
mtry min_n tree_depth learn_rate loss_reduction sample_size .config
<int> <int> <int> <dbl> <dbl> <dbl> <chr>
1 4 4 13 0.0262 0.0000333 0.922 Preprocessor1_Mo…
set.seed(904)
xgb_p_pre_wf <-
xgb_p_wf |>
finalize_workflow(select_best(xgb_p_resamples))
xgb_p_pre_wf <-
fit(xgb_p_pre_wf,
data = crime_train)
xgb_p_predictions <-
predict(object = xgb_p_pre_wf,
new_data = crime_test)$.pred
actual_p <-
crime_test$log_property
county <-
crime_test$county
xgb_p_predictions_df <-
tibble(county,actual_p,xgb_p_predictions)
rmse_xgb_p <-
rmse(xgb_p_predictions_df, actual_p, xgb_p_predictions)property_comparison <-
bind_rows(rmse_ridge_p, rmse_lasso_p, rmse_rf_p, rmse_mars_p, rmse_xgb_p)
property_comparison|>
mutate(
model = c("Ridge Regression","Lasso Regression",
"Random Forest", "MARS", "XG Boost"))|>
select(model, everything())# A tibble: 5 × 4
model .metric .estimator .estimate
<chr> <chr> <chr> <dbl>
1 Ridge Regression rmse standard 0.601
2 Lasso Regression rmse standard 0.607
3 Random Forest rmse standard 0.474
4 MARS rmse standard 0.510
5 XG Boost rmse standard 0.477
Because random forest has the lowest RMSE value, we will use random forest to predict property crime rate in 2016.
set.seed(904)
rf_p_pre_wf_2016 <-
rf_p_wf |>
finalize_workflow(select_best(rf_p_resamples))
rf_p_pre_wf_2016 <-
fit(rf_p_pre_wf,
data = crime_all_2010_cleaned)
rf_p_predictions_2016 <-
predict(object = rf_p_pre_wf_2016,
new_data = crime_all_2016_cleaned)$.pred
actual_implement_p <- crime_all_2016_cleaned$log_property
county_implement <- crime_all_2016_cleaned$county
rf_p_predictions_2016_df <-
tibble(county_implement,actual_implement_p,rf_p_predictions_2016)
rmse_rf_p_2016 <-
rmse(rf_p_predictions_2016_df, actual_implement_p, rf_p_predictions_2016)
rmse_rf_p_2016# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.459
Final prediction for property crime rate in 2016 has a RMSE of 0.459.