Final Project

Authors

Ziwen Lu, Xuyan Xiu, Doris Yan

Code
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(parsnip)
library(doFuture)
library(doParallel)
library(xgboost)
library(vip)
library(haven)
library(here)
library(icpsrdata)
library(plotly)
# create the data directory if it does not already exist
if (!dir.exists("data")) dir.create("data")

Data Wrangling

ICPSR credential
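The icpsrdata package authenticates with ICPSR credentials stored as R options, typically set in .Rprofile so they never appear in the rendered document. A minimal sketch (the email and password below are placeholders):

Code
# store ICPSR credentials so icpsr_download() can log in
options("icpsr_email" = "user@example.edu",
        "icpsr_password" = "your-password")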

Reproducible data retrieval

Code
# crime predictive data takes 1-2 mins

icpsr_download(
  file_id = 38649,
  download_dir = here("data")
)

# crime implementation data takes 1-2 mins

icpsr_download(
  file_id = 37059,
  download_dir = here("data")
)

# SES data takes 9-10 mins

icpsr_download(
  file_id = 38528,
  download_dir = here("data")
)

# school counts takes 1-2 mins

icpsr_download(
  file_id = 38569,
  download_dir = here("data")
)

# street connectivity takes 1-2 mins

icpsr_download(
  file_id = 38580,
  download_dir = here("data")
)

# pollution takes 1 min

icpsr_download(
  file_id = 38597,
  download_dir = here("data")
)

# land cover takes 1-2 mins

icpsr_download(
  file_id = 38598,
  download_dir = here("data")
)

Read data

Code
# crime modeling data

crime_predictive <- 
  read_dta(
    here("data",
         "ICPSR_38649",
         "DS0001",
         "38649-0001-Data.dta"))

# crime implementation data

crime_implementation <- 
  read_dta(
    here("data",
         "ICPSR_37059",
         "DS0004",
         "37059-0004-Data.dta"))

# ses predictive data and implementation data

ses08_17 <-
  read_dta(
    here("data",
         "ICPSR_38528",
         "DS0002",
         "38528-0002-Data.dta"))

# school counts data and implementation data

schoolct <- 
  read_dta(
    here("data",
         "ICPSR_38569",
         "DS0001",
         "38569-0001-Data.dta"))

# street connectivity predictive data

streetcon_predictive <- 
  read_dta(
    here("data",
         "ICPSR_38580",
         "DS0001",
         "38580-0001-Data.dta"))

# street connectivity implementation data

streetcon_implementation <- 
  read_dta(
    here("data",
         "ICPSR_38580",
         "DS0003",
         "38580-0003-Data.dta"))

# pollution predictive data and implementation data

pollution <- 
  read_dta(
    here("data",
         "ICPSR_38597",
         "DS0001",
         "38597-0001-Data.dta"))

# land cover predictive data and implementation data

landcover <- 
  read_dta(
    here("data",
         "ICPSR_38598",
         "DS0001",
         "38598-0001-Data.dta"))

Clean data

crime predictive

We use the annual measure of crime.

Code
crime_2010 <- 
  crime_predictive|>
  rename_all(tolower)|>
  rename(county = stcofips)|>
  select(county, year, viol, property, cpopcrim)|>
  filter(year == 2010)|>
  
  # generate crime rate per 100,000 people and the log transform
  
  mutate(viol_rate = (viol / cpopcrim) * 100000,
         property_rate = (property / cpopcrim) * 100000)|>
  mutate(log_viol = log(viol_rate),
         log_property = log(property_rate))
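Note that counties reporting zero crimes yield log(0) = -Inf here; those rows are removed in the cleaning step below.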

crime implementation

Code
crime_2016 <- 
  crime_implementation|>
  rename_all(tolower)|>
  select(fips_st, fips_cty, cpopcrim, viol, property)|>
  
  # concatenate fips_st and fips_cty to make county fips
  
  mutate(
    fips_st = ifelse(fips_st >= 1 & fips_st <= 9, 
                          paste0("0", fips_st), 
                          as.character(fips_st)) 
         )|>
  mutate(
    fips_cty = str_pad(fips_cty, 
                       width = 3, 
                       pad = "0")
         )|>
  mutate(county = str_c(fips_st, fips_cty))|>
  select(-fips_cty, -fips_st)|>
  
  # generate crime rate per 100,000 people and the log transform
  
  mutate(viol_rate = (viol / cpopcrim) * 100000,
         property_rate = (property / cpopcrim) * 100000)|>
  mutate(log_viol = log(viol_rate),
         log_property = log(property_rate))|>
  select(county, everything())
Code
# create a reusable helper for cleaning the tract-level datasets

clean_data <-
  function(data) {
    cleaned_data <- data |> 
      rename_all(tolower)|>
      mutate(
        county = substr((tract_fips10), 1, 5))|>
      select(-tract_fips10)

  return(cleaned_data)
  }

ses

The SES predictive data for 2010 come from the ACS five-year estimates for 2008-2012. The SES implementation data for 2016 come from the ACS five-year estimates for 2013-2017.
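For example, str_sub("ppov08_12", -2) returns "12", which is how the 2008-2012 indicators are selected in the code below.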

Code
# ACS 2008 to 2012 for year 2010

ses_2010 <- 
  clean_data(ses08_17)|>
  group_by(county)|>
  summarise_all(sum, na.rm = TRUE)|>
  pivot_longer(
  cols = !county, 
  names_to = "indicator",
  values_to = "value"
  )|>
  filter(str_sub(indicator, -2) == "12")|>
  
  # pivot SES indicators back as variables
  
  pivot_wider(
    names_from = indicator,
    values_from = value
  )|>
  
  # cpopcrim and totpop08_12 are both total population estimates:
  # cpopcrim comes from the crime dataset, totpop08_12 from the SES dataset
  
  # since we are predicting crime rates, it is more appropriate to use
  # the population variable from the crime dataset, so we keep cpopcrim
  # as the population estimate

  select(-totpop08_12)
   
# ACS 2013 to 2017 for year 2016

ses_2016 <- 
  clean_data(ses08_17)|>
  group_by(county)|>
  summarise_all(sum, na.rm = TRUE)|>
  pivot_longer(
  cols = !county, 
  names_to = "indicator",
  values_to = "value"
  )|>
  filter(str_sub(indicator, -2) == "17")|>
  
  # pivot SES indicators back as variables
  
  pivot_wider(
    names_from = indicator,
    values_from = value
  )|>
  select(-totpop13_17)

school counts

We use annual school counts.

Code
# school counts in 2010 and 2016

schoolct_2010 <- 
  clean_data(schoolct)|>
  group_by(county,year)|>
  summarise_all(sum, na.rm = TRUE)|>
  filter(year == 2010)

schoolct_2016 <- 
  clean_data(schoolct)|>
  group_by(county,year)|>
  summarise_all(sum, na.rm = TRUE)|>
  filter(year == 2016)

street connectivity

We use street connectivity data from 2010 for the predictive data, and street connectivity data from 2020 for the implementation data.

Code
# 2010 street connectivity data

streetcon_2010 <- 
  clean_data(streetcon_predictive)|>
  group_by(county)|>
  summarise_all(sum, na.rm = TRUE)

# use 2020 data to represent 2016 street connectivity

streetcon_2016 <- 
  streetcon_implementation|>
  rename_all(tolower)|>
  mutate(county = substr((tract_fips20), 1, 5))|>
  select(-tract_fips20)|>
  group_by(county)|>
  summarise_all(sum, na.rm = TRUE)

pollution

We use the annual measure of pollution.

Code
# pollution in 2010

pollution_2010 <-
  clean_data(pollution)|>
  group_by(county,year)|>
  summarise_all(sum, na.rm = TRUE)|>
  filter(year == 2010)

# pollution in 2016

pollution_2016 <-
  clean_data(pollution)|>
  group_by(county,year)|>
  summarise_all(sum, na.rm = TRUE)|>
  filter(year == 2016)

land cover

We use the annual interpolation of land cover.

Code
# land cover at 2010

landcover_2010 <- 
  landcover|>
  rename_all(tolower)|>
  mutate(county = substr((tract_fips), 1, 5))|>
  select(-tract_fips, -intp_flag)|>
  group_by(county, year_intp)|>
  summarise_all(sum, na.rm = TRUE) |>
  filter(year_intp == 2010)

# land cover at 2016

landcover_2016 <- landcover|>
  rename_all(tolower)|>
  mutate(county = substr((tract_fips), 1, 5))|>
  select(-tract_fips, -intp_flag)|>
  group_by(county, year_intp)|>
  summarise_all(sum, na.rm = TRUE) |>
  filter(year_intp == 2016)

Combine All Datasets

predictive data

Code
# combine the data sets with the by "county" column

crime_all_2010 <- 
  full_join(
  crime_2010,
  full_join(
    ses_2010,
   full_join(
     schoolct_2010,
    full_join(
      streetcon_2010,
      full_join(
        pollution_2010, landcover_2010, by = "county"
        ),
      by = "county"
      ),
    by = "county"
    ),
   by = "county"
   ),
  by = "county"
  )

crime_all_2010 <- 
  crime_all_2010|>
  select(-year.y, -year.x, -year_intp)
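The nested full_join() calls above can be flattened by folding the join over a list of data frames. A sketch of an equivalent approach (crime_all_2010_alt is a hypothetical name; the extra year columns are dropped up front so only the crime data's year survives):

Code
# fold full_join over the list of cleaned 2010 datasets;
# reduce() comes from purrr, loaded with the tidyverse
crime_all_2010_alt <-
  list(crime_2010,
       ses_2010,
       schoolct_2010 |> select(-year),
       streetcon_2010,
       pollution_2010 |> select(-year),
       landcover_2010 |> select(-year_intp)) |>
  reduce(full_join, by = "county")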

implementation data

Code
# combine the data sets with the by "county" and "year" columns

crime_all_2016 <- 
  full_join(
  crime_2016,
  full_join(
    ses_2016,
   full_join(
     schoolct_2016,
    full_join(
      streetcon_2016,
      full_join(
        pollution_2016, landcover_2016, by = "county"
        ),
      by = "county"
      ),
    by = "county"
    ),
   by = "county"
   ),
  by = "county"
  )

crime_all_2016 <- 
  crime_all_2016|>
  select(-year.y, -year.x, -year_intp)|>
  select(county, everything())|>
  mutate(year = 2016)

2010 predictive data EDA and data cleaning

We then removed the Stata format and label attributes from the outcome variables.

Code
# remove Stata format and label attributes from the outcome variables

outcome_vars <- c("viol", "property", "cpopcrim", "viol_rate",
                  "property_rate", "log_viol", "log_property")

for (var in outcome_vars) {
  attr(crime_all_2010[[var]], "format.stata") <- NULL
  attr(crime_all_2010[[var]], "label") <- NULL
}
 
str(crime_all_2010)
tibble [3,276 × 114] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:3276] "01001" "01003" "01005" "01007" ...
  ..- attr(*, "label")= chr "Five-digit FIPS county code"
  ..- attr(*, "format.stata")= chr "%9s"
 $ year                 : num [1:3276] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
  ..- attr(*, "label")= chr "Year that the offenses occurred"
  ..- attr(*, "format.stata")= chr "%4.0f"
 $ viol                 : num [1:3276] 143 385 25 59 24 25 66 511 107 84 ...
 $ property             : num [1:3276] 1521 4161 455 408 305 ...
 $ cpopcrim             : num [1:3276] 54571 182265 27178 22915 57322 ...
 $ viol_rate            : num [1:3276] 262 211.2 92 257.5 41.9 ...
 $ property_rate        : num [1:3276] 2787 2283 1674 1780 532 ...
 $ log_viol             : num [1:3276] 5.57 5.35 4.52 5.55 3.73 ...
 $ log_property         : num [1:3276] 7.93 7.73 7.42 7.48 6.28 ...
 $ popden08_12          : num [1:3276] 9634 12432 1266 184 834 ...
 $ phispanic08_12       : num [1:3276] 0.2796 1.2298 0.4183 0.0722 0.7467 ...
 $ pnhwhite08_12        : num [1:3276] 8.92 26.03 4.41 3.13 7.97 ...
 $ pnhblack08_12        : num [1:3276] 2.5108 2.876 3.9907 0.7032 0.0983 ...
 $ pfborn08_12          : num [1:3276] 0.1699 1.046 0.2123 0.0524 0.4356 ...
 $ ped1_08_12           : num [1:3276] 2.038 3.868 2.191 0.921 2.275 ...
 $ ped2_08_12           : num [1:3276] 7.63 18.93 5.37 2.75 5.65 ...
 $ ped3_08_12           : num [1:3276] 2.334 8.205 1.436 0.328 1.077 ...
 $ pin1b_08_12          : num [1:3276] 0.761 2.018 1.458 0.361 0.751 ...
 $ pin2b_08_12          : num [1:3276] 1.368 4.478 1.54 0.858 1.448 ...
 $ pin3b_08_12          : num [1:3276] 2.26 6.33 1.96 1.04 1.85 ...
 $ pin4b_08_12          : num [1:3276] 4.6 11.07 2.85 1.27 3.48 ...
 $ pin5b_08_12          : num [1:3276] 3.006 7.102 1.189 0.465 1.466 ...
 $ pincgt75k08_12       : num [1:3276] 4.914 11.743 2.036 0.948 2.64 ...
 $ pnvmar08_12          : num [1:3276] 3.21 6.6 2.78 1.01 1.77 ...
 $ p18yr_08_12          : num [1:3276] 3.057 6.79 2.054 0.901 2.189 ...
 $ p18_2908_12          : num [1:3276] 1.765 4.127 1.409 0.601 1.233 ...
 $ p30_3908_12          : num [1:3276] 1.467 3.475 1.242 0.505 1.164 ...
 $ p40_4908_12          : num [1:3276] 1.88 4.303 1.14 0.656 1.268 ...
 $ p50_6908_12          : num [1:3276] 2.83 8.6 2.23 0.94 2.27 ...
 $ pge7008_12           : num [1:3276] 0.999 3.711 0.921 0.398 0.874 ...
 $ punemp08_12          : num [1:3276] 0.672 1.563 0.554 0.228 0.524 ...
 $ pprof08_12           : num [1:3276] 3.61 9.605 2.568 0.782 2.422 ...
 $ ppov08_12            : num [1:3276] 1.508 4.143 2.287 0.635 1.353 ...
 $ ppubas08_12          : num [1:3276] 1.47 2.634 2.031 0.529 1.136 ...
 $ pfhfam08_12          : num [1:3276] 1.35 2.642 1.392 0.319 0.588 ...
 $ pownoc08_12          : num [1:3276] 9.4 23.3 6.14 3.34 7.32 ...
 $ disadvantage08_12    : num [1:3276] 1.502 2.772 2.051 0.483 0.74 ...
 $ disadvantage2_08_12  : num [1:3276] 1.25 2.745 1.566 0.428 0.9 ...
 $ affluence08_12       : num [1:3276] 3.619 9.851 2.013 0.686 2.046 ...
 $ ethnicimmigrant08_12 : num [1:3276] 0.2247 1.1379 0.3153 0.0623 0.5912 ...
 $ all_school           : num [1:3276] 18 55 14 14 24 6 8 47 18 8 ...
 $ elem                 : num [1:3276] 11 37 9 7 16 4 5 26 11 5 ...
 $ middle               : num [1:3276] 7 18 7 5 13 2 4 20 8 6 ...
 $ high                 : num [1:3276] 9 17 7 6 12 3 5 22 8 6 ...
 $ public_school        : num [1:3276] 15 46 11 11 20 5 7 41 14 8 ...
 $ public_trad          : num [1:3276] 15 46 11 11 20 5 7 41 14 8 ...
 $ public_charter       : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet        : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ pub_elem             : num [1:3276] 8 28 6 5 12 3 4 20 7 5 ...
 $ pub_middle           : num [1:3276] 6 11 4 3 10 1 3 15 4 6 ...
 $ pub_high             : num [1:3276] 8 10 4 3 10 2 4 17 4 6 ...
 $ public_trad_elem     : num [1:3276] 8 28 6 5 12 3 4 20 7 5 ...
 $ public_trad_middle   : num [1:3276] 6 11 4 3 10 1 3 15 4 6 ...
 $ public_trad_high     : num [1:3276] 8 10 4 3 10 2 4 17 4 6 ...
 $ public_charter_elem  : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_charter_middle: num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_charter_high  : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet_elem   : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet_middle : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet_high   : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ private_school       : num [1:3276] 3 9 3 3 4 1 1 6 4 0 ...
 $ private_elem         : num [1:3276] 3 9 3 2 4 1 1 6 4 0 ...
 $ private_middle       : num [1:3276] 1 7 3 2 3 1 1 5 4 0 ...
 $ private_high         : num [1:3276] 1 7 3 3 2 1 1 5 4 0 ...
 $ n_streets            : num [1:3276] 8109 36296 7293 8226 13360 ...
 $ sum_strintlen        : num [1:3276] 2276795 7904534 2524658 2251633 3173244 ...
 $ n_nodes              : num [1:3276] 7160 31321 6249 7343 12267 ...
 $ n_realnodes          : num [1:3276] 3116 15005 3199 3450 5231 ...
 $ n_blocks             : num [1:3276] 1887 8587 1820 1777 2750 ...
 $ tract_area_sqmiles   : num [1:3276] 604 1712 905 626 651 ...
 $ linknoderatio        : num [1:3276] 13.96 36.35 10.68 4.47 9.8 ...
 $ intdensity           : num [1:3276] 384.3 733.6 78.3 25.2 74.6 ...
 $ strnetdensity        : num [1:3276] 131227 251716 35572 14991 44563 ...
 $ connoderatio         : num [1:3276] 5.68 15.14 4.71 1.87 3.84 ...
 $ blockdensity         : num [1:3276] 190.7 428.2 49.7 13.4 38.2 ...
 $ avgblocklength       : num [1:3276] 2671 5966 2859 1071 2153 ...
 $ medblocklength       : num [1:3276] 1730 3921 1546 578 1372 ...
 $ gamma                : num [1:3276] 4.68 12.15 3.58 1.49 3.27 ...
 $ alpha                : num [1:3276] 1.001 2.701 0.855 0.236 0.407 ...
 $ count_tri_facilities : num [1:3276] 4 10 5 3 3 0 3 17 4 2 ...
 $ value_0              : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ prop_value_0         : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ value_11             : num [1:3276] 2.19e+07 2.20e+08 5.60e+07 9.77e+06 2.03e+07 ...
 $ prop_value_11        : num [1:3276] 0.1213 2.2523 0.4686 0.0275 0.1046 ...
 $ value_12             : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ prop_value_12        : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ value_21             : num [1:3276] 6.07e+07 2.10e+08 6.54e+07 6.62e+07 8.76e+07 ...
 $ prop_value_21        : num [1:3276] 1.72 3.379 0.369 0.176 0.477 ...
 $ value_22             : num [1:3276] 22333800 85405500 18084000 8225400 22709700 ...
 $ prop_value_22        : num [1:3276] 1.3087 1.8475 0.196 0.0206 0.1229 ...
 $ value_23             : num [1:3276] 6585900 31294200 5006400 2522700 5593500 ...
 $ prop_value_23        : num [1:3276] 0.48646 0.80184 0.07451 0.00617 0.02949 ...
 $ value_24             : num [1:3276] 1817400 8820900 1484100 525300 1422600 ...
 $ prop_value_24        : num [1:3276] 0.14089 0.25293 0.03721 0.00129 0.0073 ...
 $ value_31             : num [1:3276] 2483100 24774300 4538400 2162100 3594300 ...
 $ prop_value_31        : num [1:3276] 0.03998 0.4632 0.01481 0.00498 0.01928 ...
 $ value_41             : num [1:3276] 2.31e+08 4.88e+06 2.06e+08 3.57e+08 5.84e+08 ...
 $ prop_value_41        : num [1:3276] 1.0781 0.0288 0.824 0.953 3.1266 ...
 $ value_42             : num [1:3276] 2.95e+08 1.30e+09 7.33e+08 4.81e+08 1.55e+08 ...
  [list output truncated]
 - attr(*, "label")= chr "National Neighborhood Data Archive (NaNDA): Crimes by County, United States, 200"
Code
# the suffixes on the 2008-2012 and 2013-2017 SES indicators differ,
# so we remove them to make the variable names match across the
# predictive and implementation data
remove_string_2010 <- 
  function(varname){
  sub("08_12","",varname)
}
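# illustrative example (not part of the original code):
# remove_string_2010("ppov08_12")  # returns "ppov"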

crime_all_2010_cleaned <- 
  crime_all_2010|>
  
  # drop rows where both outcomes are missing, and rows where
  # either outcome is infinite (log of a zero crime rate)
  
  filter(!is.na(log_viol) | !is.na(log_property)) |>
  filter(!(is.infinite(log_viol) | is.infinite(log_property)))|>
  rename_with(remove_string_2010, contains("08_12"))


# explore missing values

missing_values_2010 <-
  colSums(is.na(crime_all_2010_cleaned))

missing_values_2010 <-
  as.data.frame(missing_values_2010)

# look at the variables with the most missing values, 
# no variable has over 50% missing values

missing_values_percent_2010 <- 
  missing_values_2010|>
  mutate(missing_percent = missing_values_2010/nrow(crime_all_2010_cleaned))|>
  arrange(desc(missing_percent))|>
  top_n(10)

2016 implementation data EDA and data cleaning

Repeat the above data cleaning process for implementation data in 2016.

Code
attr(crime_all_2016$viol, "format.stata") <- NULL
attr(crime_all_2016$property, "format.stata") <- NULL
attr(crime_all_2016$cpopcrim, "format.stata") <- NULL
attr(crime_all_2016$viol_rate, "format.stata") <- NULL
attr(crime_all_2016$property_rate, "format.stata") <- NULL
attr(crime_all_2016$log_viol, "format.stata") <- NULL
attr(crime_all_2016$log_property, "format.stata") <- NULL


attr(crime_all_2016$viol, "label") <- NULL
attr(crime_all_2016$property, "label") <- NULL
attr(crime_all_2016$cpopcrim, "label") <- NULL
attr(crime_all_2016$viol_rate, "label") <- NULL
attr(crime_all_2016$property_rate, "label") <- NULL
attr(crime_all_2016$log_viol, "label") <- NULL
attr(crime_all_2016$log_property, "label") <- NULL
 
str(crime_all_2016)
tibble [3,276 × 114] (S3: tbl_df/tbl/data.frame)
 $ county               : chr [1:3276] "01001" "01003" "01005" "01007" ...
 $ cpopcrim             : num [1:3276] 54499 207584 25778 22474 57565 ...
 $ viol                 : num [1:3276] 153 443 114 16 433 ...
 $ property             : num [1:3276] 1722 4057 664 206 1228 ...
 $ viol_rate            : num [1:3276] 280.7 213.4 442.2 71.2 752.2 ...
 $ property_rate        : num [1:3276] 3160 1954 2576 917 2133 ...
 $ log_viol             : num [1:3276] 5.64 5.36 6.09 4.27 6.62 ...
 $ log_property         : num [1:3276] 8.06 7.58 7.85 6.82 7.67 ...
 $ popden13_17          : num [1:3276] 9675 13924 1199 181 838 ...
 $ phispanic13_17       : num [1:3276] 0.3698 1.2415 0.3662 0.0892 0.8895 ...
 $ pnhwhite13_17        : num [1:3276] 8.68 25.81 4.14 3.18 7.8 ...
 $ pnhblack13_17        : num [1:3276] 2.593 3.105 4.306 0.701 0.12 ...
 $ pfborn13_17          : num [1:3276] 0.2289 0.8958 0.2211 0.0443 0.4392 ...
 $ ped1_13_17           : num [1:3276] 1.697 3.31 2.342 0.629 1.894 ...
 $ ped2_13_17           : num [1:3276] 7.54 18.79 5.48 2.77 5.94 ...
 $ ped3_13_17           : num [1:3276] 2.768 8.895 1.178 0.602 1.169 ...
 $ pin1b_13_17          : num [1:3276] 0.999 2.132 1.437 0.314 0.67 ...
 $ pin2b_13_17          : num [1:3276] 1.439 3.141 1.739 0.533 1.307 ...
 $ pin3b_13_17          : num [1:3276] 2.114 5.733 2.196 0.983 1.97 ...
 $ pin4b_13_17          : num [1:3276] 4.26 11.41 2.45 1.43 3.49 ...
 $ pin5b_13_17          : num [1:3276] 3.189 8.584 1.181 0.741 1.562 ...
 $ pincgt75k13_17       : num [1:3276] 5.04 13.3 2.32 1.28 2.72 ...
 $ pnvmar13_17          : num [1:3276] 3.16 7.67 3.06 1.12 1.84 ...
 $ p18yr_13_17          : num [1:3276] 2.872 6.629 1.986 0.882 2.156 ...
 $ p18_2913_17          : num [1:3276] 1.802 3.932 1.436 0.628 1.232 ...
 $ p30_3913_17          : num [1:3276] 1.557 3.43 1.119 0.475 1.018 ...
 $ p40_4913_17          : num [1:3276] 1.637 3.864 1.06 0.574 1.263 ...
 $ p50_6913_17          : num [1:3276] 2.91 9 2.27 1.03 2.32 ...
 $ pge7013_17           : num [1:3276] 1.218 4.148 1.125 0.414 1.016 ...
 $ punemp13_17          : num [1:3276] 0.429 1.075 0.537 0.181 0.207 ...
 $ pprof13_17           : num [1:3276] 3.9 10.37 2.24 1.02 2.5 ...
 $ ppov13_17            : num [1:3276] 1.746 3.991 2.498 0.558 1.479 ...
 $ ppubas13_17          : num [1:3276] 1.816 2.809 2.377 0.526 0.994 ...
 $ pfhfam13_17          : num [1:3276] 1.234 2.044 1.555 0.317 0.634 ...
 $ pownoc13_17          : num [1:3276] 8.9 22.84 5.63 3.09 7.1 ...
 $ disadvantage13_17    : num [1:3276] 1.564 2.605 2.255 0.456 0.687 ...
 $ disadvantage2_13_17  : num [1:3276] 1.306 2.48 1.742 0.395 0.828 ...
 $ affluence13_17       : num [1:3276] 3.901 10.854 1.913 0.969 2.128 ...
 $ ethnicimmigrant13_17 : num [1:3276] 0.2994 1.0686 0.2937 0.0667 0.6644 ...
 $ all_school           : num [1:3276] 18 52 10 11 22 5 8 42 17 8 ...
 $ elem                 : num [1:3276] 10 35 5 6 13 2 5 21 8 5 ...
 $ middle               : num [1:3276] 6 17 5 3 12 2 4 17 7 6 ...
 $ high                 : num [1:3276] 8 16 4 4 13 3 5 18 7 6 ...
 $ public_school        : num [1:3276] 16 45 9 10 20 4 7 39 14 8 ...
 $ public_trad          : num [1:3276] 16 45 9 10 20 4 6 39 14 8 ...
 $ public_charter       : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet        : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
 $ pub_elem             : num [1:3276] 8 28 4 5 11 1 4 18 7 5 ...
 $ pub_middle           : num [1:3276] 6 10 4 2 10 1 3 14 4 6 ...
 $ pub_high             : num [1:3276] 8 9 3 3 11 2 4 15 4 6 ...
 $ public_trad_elem     : num [1:3276] 8 28 4 5 11 1 3 18 7 5 ...
 $ public_trad_middle   : num [1:3276] 6 10 4 2 10 1 2 14 4 6 ...
 $ public_trad_high     : num [1:3276] 8 9 3 3 11 2 3 15 4 6 ...
 $ public_charter_elem  : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_charter_middle: num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_charter_high  : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ public_magnet_elem   : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
 $ public_magnet_middle : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
 $ public_magnet_high   : num [1:3276] 0 0 0 0 0 0 1 0 0 0 ...
 $ private_school       : num [1:3276] 2 7 1 1 2 1 1 3 3 0 ...
 $ private_elem         : num [1:3276] 2 7 1 1 2 1 1 3 1 0 ...
 $ private_middle       : num [1:3276] 0 7 1 1 2 1 1 3 3 0 ...
 $ private_high         : num [1:3276] 0 7 1 1 2 1 1 3 3 0 ...
 $ n_streets            : num [1:3276] 7077 33577 6519 5738 12773 ...
 $ sum_strintlen        : num [1:3276] 2113191 7669943 2430786 1876016 3165251 ...
 $ n_nodes              : num [1:3276] 6159 29072 5528 5129 11740 ...
 $ n_realnodes          : num [1:3276] 2720 13556 2889 2009 4736 ...
 $ n_blocks             : num [1:3276] 1512 6317 1283 1090 2404 ...
 $ tract_area_sqmiles   : num [1:3276] 604 1712 905 626 651 ...
 $ linknoderatio        : num [1:3276] 19.78 50.21 10.77 8.94 17.45 ...
 $ intdensity           : num [1:3276] 525.2 995.9 76.6 44.4 136.1 ...
 $ strnetdensity        : num [1:3276] 187018 366850 35389 30241 82999 ...
 $ connoderatio         : num [1:3276] 8.07 20.43 4.82 3.13 6.52 ...
 $ blockdensity         : num [1:3276] 227.3 458.1 35.6 23.8 71 ...
 $ avgblocklength       : num [1:3276] 4179 8417 3076 2589 3965 ...
 $ medblocklength       : num [1:3276] 2660 5603 1665 1455 2497 ...
 $ gamma                : num [1:3276] 6.64 16.8 3.61 2.99 5.84 ...
 $ alpha                : num [1:3276] 1.437 3.663 0.902 0.477 0.742 ...
 $ count_tri_facilities : num [1:3276] 5 11 6 3 2 1 1 22 5 2 ...
 $ value_0              : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ prop_value_0         : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ value_11             : num [1:3276] 2.19e+07 2.20e+08 5.60e+07 9.77e+06 2.03e+07 ...
 $ prop_value_11        : num [1:3276] 0.1213 2.2523 0.4686 0.0275 0.1046 ...
 $ value_12             : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ prop_value_12        : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
 $ value_21             : num [1:3276] 6.07e+07 2.10e+08 6.54e+07 6.62e+07 8.76e+07 ...
 $ prop_value_21        : num [1:3276] 1.72 3.379 0.369 0.176 0.477 ...
 $ value_22             : num [1:3276] 22333800 85405500 18084000 8225400 22709700 ...
 $ prop_value_22        : num [1:3276] 1.3087 1.8475 0.196 0.0206 0.1229 ...
 $ value_23             : num [1:3276] 6585900 31294200 5006400 2522700 5593500 ...
 $ prop_value_23        : num [1:3276] 0.48646 0.80184 0.07451 0.00617 0.02949 ...
 $ value_24             : num [1:3276] 1817400 8820900 1484100 525300 1422600 ...
 $ prop_value_24        : num [1:3276] 0.14089 0.25293 0.03721 0.00129 0.0073 ...
 $ value_31             : num [1:3276] 2483100 24774300 4538400 2162100 3594300 ...
 $ prop_value_31        : num [1:3276] 0.03998 0.4632 0.01481 0.00498 0.01928 ...
 $ value_41             : num [1:3276] 2.31e+08 4.88e+06 2.06e+08 3.57e+08 5.84e+08 ...
 $ prop_value_41        : num [1:3276] 1.0781 0.0288 0.824 0.953 3.1266 ...
 $ value_42             : num [1:3276] 2.95e+08 1.30e+09 7.33e+08 4.81e+08 1.55e+08 ...
 $ prop_value_42        : num [1:3276] 1.244 6.838 2.748 1.207 0.862 ...
  [list output truncated]
 - attr(*, "label")= chr "Uniform Crime Reporting Program Data: County-Level Detailed Arrest and Offense D"
Code
remove_string_2016 <-
  function(varname){
  sub("13_17","",varname)
}

crime_all_2016_cleaned <-
  crime_all_2016|>
  
  # drop rows where both outcomes are missing, and rows where
  # either outcome is infinite
  
  filter(!is.na(log_viol) | !is.na(log_property)) |>
  filter(!(is.infinite(log_viol) | is.infinite(log_property)))|>
  rename_with(remove_string_2016, contains("13_17"))


# explore missing values

missing_values_2016 <- 
  colSums(is.na(crime_all_2016_cleaned))

missing_values_2016 <-
  as.data.frame(missing_values_2016)

# look at the variables with the most missing values, 
# no variable has over 50% missing values

missing_values_percent_2016 <-
  missing_values_2016|>
  mutate(missing_percent = missing_values_2016/nrow(crime_all_2016_cleaned))|>
  arrange(desc(missing_percent))|>
  top_n(10)

Crime Rate Maps

Before starting the machine learning process, we used the plotly package to map the average violent and property crime rates across the U.S. from 2002 to 2014, displayed state by state for easier comparison.

Build dataset

We generate crime rate as the number of crimes per 100,000 people.

\[ \text{Rate} = \frac{\text{Crime}}{\text{Population}} \times 10^{5} \tag{1}\]
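For example, county 01001 above reports 143 violent crimes for a crime-reporting population of 54,571, giving a violent crime rate of (143 / 54,571) × 10^5 ≈ 262 per 100,000 people.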

Code
crime_02_14 <- 
  crime_predictive|>
  rename_all(tolower)|>
  rename(county=stcofips)|>
  mutate(viol = murder+rape+robbery+agasslt)|>
  mutate(property = burglry+larceny+mvtheft)|>
  
  select(county, year, viol, property, cpopcrim) |>
  mutate(viol_rate = (viol / cpopcrim) * 100000) |>
  mutate(property_rate = (property / cpopcrim) * 100000) |>
  
  filter(!is.na(viol_rate) | !is.na(property_rate)) |>
  filter(!(is.infinite(viol_rate) | is.infinite(property_rate)))

Extract FIPS codes

We then aggregated counties to states by extracting the first two digits of the five-digit county FIPS code, which makes regional differences easier to compare visually.

Code
crime_02_14$state_fips <- 
  substr(crime_02_14$county, 
         1, 2)

state_fips_to_abb <- 
  tibble(
  state_fips = c("01", "02", "04", "05", "06", 
                 "08", "09", "10", "11", "12", 
                 "13", "15", "16", "17", "18", 
                 "19", "20", "21", "22", "23", 
                 "24", "25", "26", "27", "28", 
                 "29", "30", "31", "32", "33", 
                 "34", "35", "36", "37", "38", 
                 "39", "40", "41", "42", "44", 
                 "45", "46", "47", "48", "49", 
                 "50", "51", "53", "54", "55", "56"),
  state_abb = c("AL", "AK", "AZ", "AR", "CA", "CO", 
                "CT", "DE", "DC", "FL", "GA", "HI", 
                "ID", "IL", "IN", "IA", "KS", "KY", 
                "LA", "ME", "MD", "MA", "MI", "MN", 
                "MS", "MO", "MT", "NE", "NV", "NH", 
                "NJ", "NM", "NY", "NC", "ND", "OH", 
                "OK", "OR", "PA", "RI", "SC", "SD", 
                "TN", "TX", "UT", "VT", "VA", "WA", 
                "WV", "WI", "WY")
)

Choropleth map

Finally, we chose a choropleth map to illustrate the average violent and property crime rates in the U.S. from 2002 to 2014. To keep the visualization informative, we used the plotly package to add hover text for interactivity; readers can also zoom in and out for details.

Code
# convert county-level data to state-level data, aggregate violent crime data

crime_data_map <- crime_02_14 |>
  group_by(state_fips) |>
  summarise(
    avg_viol_rate = mean(viol_rate, na.rm = TRUE),
    avg_property_rate = mean(property_rate, na.rm = TRUE)
  ) |>
  
  left_join(state_fips_to_abb, 
            by = 'state_fips') |>
  
  mutate(
    hover_violence = paste(state_abb, 
                           '<br>', 
                           "Violent Crimes Rate: ", 
                           round(avg_viol_rate, 2)),
    hover_property = paste(state_abb, 
                           '<br>', 
                           "Property Crimes Rate: ", 
                           round(avg_property_rate, 2))  
  ) |>
  
  pivot_longer(
    cols = c(avg_viol_rate, 
             avg_property_rate),
    names_to = "crime_type",
    values_to = "crime_count"
  )

violent crime rate

Code
# create and output the map for violent crime

violence_map <- 
  plot_geo(crime_data_map |>
             filter(crime_type == "avg_viol_rate"),
           locationmode = 'USA-states') |>
  
  add_trace(
    z = ~crime_count, 
    text = ~hover_violence,  
    hoverinfo = 'text',     
    locations = ~state_abb,
    color = ~crime_count,
    zmin = 0, 
    zmax = 1000,
    colors = c("#1a9641", "#ffffbf", "#fdae61", "#d7191c"),
    colorbar = list(title = "Violent Crimes Rate",
                    tickvals = c(250, 500, 750))
  ) |>
  
  layout(
    title = 'Average US Crimes Rate by State, 2002 - 2014',
    geo = list(
      scope = 'usa',
      projection = list(type = 'albers usa'),
      showlakes = TRUE,
      lakecolor = toRGB('white')
    )
  )

property crime rate

Code
# create and output the map for property crime rate

property_map <- 
  plot_geo(crime_data_map |>
             filter(crime_type == "avg_property_rate"),
           locationmode = 'USA-states') |>
  
  add_trace(
    z = ~crime_count, 
    text = ~hover_property,   
    hoverinfo = 'text',       
    locations = ~state_abb,
    color = ~crime_count,
    zmin = 0, 
    zmax = 5000,
    colors = c("#2b83ba", "#ffffbf", "#fdae61", "#d7191c"),
    colorbar = list(title = "Property Crimes Rate",
                    tickvals = c(1000, 2000, 3000, 4000))
  ) |>
  
  layout(
    title = 'Average US Crimes Rate by State, 2002 - 2014',
    geo = list(
      scope = 'usa',
      projection = list(type = 'albers usa'),
      showlakes = TRUE,
      lakecolor = toRGB('white')
    )
  )

Combine the maps

Code
fig <- 
  plotly::subplot(violence_map,
                  property_map,
                  nrows = 2,
                  margin = 0.05)

# print the figure
fig

Areas with high crime rates are shown in darker red. Crime rates are higher in the South and near the border. There are also some interesting findings: Texas, usually considered to have a high rate of gun ownership, has an average violent crime rate below 300, which could be influenced by its large population.

Splitting Data

Code
set.seed(904)

crime_split <- 
  initial_split(crime_all_2010_cleaned, prop = 0.8)

crime_train <- 
  training(crime_split)
crime_test <- 
  testing(crime_split)
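As a quick sanity check (a sketch, not in the original), the partition sizes should reflect the 80/20 split:

Code
# proportion of counties in each partition
nrow(crime_train) / nrow(crime_all_2010_cleaned)
nrow(crime_test) / nrow(crime_all_2010_cleaned)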

Cross-validation

Code
# set up 5-fold cross validation

set.seed(904)
crime_folds <- 
  vfold_cv(data = crime_train, v = 5)

Violent Crime

Ridge Regression

feature and target engineering

Code
# create a recipe

ridge_v_rec <- 
  recipe(log_viol ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_property,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())|>
  step_normalize(all_predictors())
 
# check if feature engineering works to remove variables

ridge_v_summary <-
  summary(ridge_v_rec)

# model
ridge_v_mod <- 
  linear_reg(penalty = tune(), mixture = 0) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "glmnet")  

ridge_v_grid <-
  grid_regular(penalty(),
               levels = 20)
# workflow
ridge_v_wf <- 
  workflow() |>
  add_recipe(recipe = ridge_v_rec) |>
  add_model(spec = ridge_v_mod)
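Note that summary() on an unprepped recipe only lists variable roles. To verify which predictors step_nzv() actually removes, the recipe can be prepped and baked; a sketch, assuming the objects defined above:

Code
# estimate the preprocessing steps on the training data, then
# compare column names before and after baking
ridge_v_prepped <- prep(ridge_v_rec, training = crime_train)
setdiff(names(crime_train),
        names(bake(ridge_v_prepped, new_data = NULL)))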

choose the best model

Code
ridge_v_resamples <-
  ridge_v_wf |>
  tune_grid(
    resamples = crime_folds,
    grid = ridge_v_grid,
    metrics = metric_set(rmse)
    )

ridge_v_resamples |>
  collect_metrics()
# A tibble: 20 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 1   e-10 rmse    standard   0.776     5  0.0150 Preprocessor1_Model01
 2 3.36e-10 rmse    standard   0.776     5  0.0150 Preprocessor1_Model02
 3 1.13e- 9 rmse    standard   0.776     5  0.0150 Preprocessor1_Model03
 4 3.79e- 9 rmse    standard   0.776     5  0.0150 Preprocessor1_Model04
 5 1.27e- 8 rmse    standard   0.776     5  0.0150 Preprocessor1_Model05
 6 4.28e- 8 rmse    standard   0.776     5  0.0150 Preprocessor1_Model06
 7 1.44e- 7 rmse    standard   0.776     5  0.0150 Preprocessor1_Model07
 8 4.83e- 7 rmse    standard   0.776     5  0.0150 Preprocessor1_Model08
 9 1.62e- 6 rmse    standard   0.776     5  0.0150 Preprocessor1_Model09
10 5.46e- 6 rmse    standard   0.776     5  0.0150 Preprocessor1_Model10
11 1.83e- 5 rmse    standard   0.776     5  0.0150 Preprocessor1_Model11
12 6.16e- 5 rmse    standard   0.776     5  0.0150 Preprocessor1_Model12
13 2.07e- 4 rmse    standard   0.776     5  0.0150 Preprocessor1_Model13
14 6.95e- 4 rmse    standard   0.776     5  0.0150 Preprocessor1_Model14
15 2.34e- 3 rmse    standard   0.776     5  0.0150 Preprocessor1_Model15
16 7.85e- 3 rmse    standard   0.776     5  0.0150 Preprocessor1_Model16
17 2.64e- 2 rmse    standard   0.775     5  0.0148 Preprocessor1_Model17
18 8.86e- 2 rmse    standard   0.768     5  0.0152 Preprocessor1_Model18
19 2.98e- 1 rmse    standard   0.767     5  0.0164 Preprocessor1_Model19
20 1   e+ 0 rmse    standard   0.776     5  0.0172 Preprocessor1_Model20
Code
top_ridge_v_models <-
  ridge_v_resamples |>
  show_best() |>
  arrange(penalty) 

ridge_v_resamples|>
  select_best()
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1   0.298 Preprocessor1_Model19
Code
ridge_v_resamples|>
  autoplot()

prediction

Code
set.seed(904)
ridge_v_pre_wf <- 
  ridge_v_wf |> 
  finalize_workflow(select_best(ridge_v_resamples))

ridge_v_pre_wf <-
  fit(ridge_v_pre_wf, 
      data = crime_train)

ridge_v_predictions <- 
  predict(object = ridge_v_pre_wf, 
          new_data = crime_test)$.pred

actual <- 
  crime_test$log_viol

county <- 
  crime_test$county

ridge_v_predictions_df <-
  tibble(county,actual,ridge_v_predictions)

rmse_ridge_v <- rmse(ridge_v_predictions_df, actual, ridge_v_predictions)
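The same finalize/fit/predict/score pattern repeats for every model below. A small helper (hypothetical, not part of the original analysis) could collapse the duplication:

Code
# hypothetical helper: finalize a tuned workflow with its best
# parameters, fit on the training set, and compute test-set RMSE
# for the given outcome column
evaluate_wf <- function(wf, resamples, train, test, outcome) {
  fitted <- wf |>
    finalize_workflow(select_best(resamples, metric = "rmse")) |>
    fit(data = train)
  tibble(
    county = test$county,
    actual = test[[outcome]],
    .pred = predict(fitted, new_data = test)$.pred
  ) |>
    rmse(actual, .pred)
}

# e.g. evaluate_wf(ridge_v_wf, ridge_v_resamples,
#                  crime_train, crime_test, "log_viol")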

Lasso

feature and target engineering

Code
lasso_v_rec <- 
  recipe(log_viol ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_property,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())|>
  step_normalize(all_predictors())

# check if feature engineering works to remove variables

lasso_v_summary <- 
  summary(lasso_v_rec)

# create a model

lasso_v_mod <- 
  linear_reg(penalty = tune(), mixture = 1) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "glmnet")  
 
lasso_v_grid <-
  grid_regular(penalty(),
               levels = 20)

lasso_v_wf <-
  workflow() |>
  add_recipe(recipe = lasso_v_rec) |>
  add_model(spec = lasso_v_mod)

choose the best model

Code
set.seed(904)
lasso_v_resamples <- 
  lasso_v_wf|>
  tune_grid(
    resamples = crime_folds,
    grid = lasso_v_grid,
    metrics = metric_set(rmse)
    )

lasso_v_resamples |>
  collect_metrics()
# A tibble: 20 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 1   e-10 rmse    standard   0.861     5  0.0470 Preprocessor1_Model01
 2 3.36e-10 rmse    standard   0.861     5  0.0470 Preprocessor1_Model02
 3 1.13e- 9 rmse    standard   0.861     5  0.0470 Preprocessor1_Model03
 4 3.79e- 9 rmse    standard   0.861     5  0.0470 Preprocessor1_Model04
 5 1.27e- 8 rmse    standard   0.861     5  0.0470 Preprocessor1_Model05
 6 4.28e- 8 rmse    standard   0.861     5  0.0470 Preprocessor1_Model06
 7 1.44e- 7 rmse    standard   0.861     5  0.0470 Preprocessor1_Model07
 8 4.83e- 7 rmse    standard   0.861     5  0.0470 Preprocessor1_Model08
 9 1.62e- 6 rmse    standard   0.861     5  0.0470 Preprocessor1_Model09
10 5.46e- 6 rmse    standard   0.861     5  0.0470 Preprocessor1_Model10
11 1.83e- 5 rmse    standard   0.861     5  0.0470 Preprocessor1_Model11
12 6.16e- 5 rmse    standard   0.858     5  0.0471 Preprocessor1_Model12
13 2.07e- 4 rmse    standard   0.829     5  0.0306 Preprocessor1_Model13
14 6.95e- 4 rmse    standard   0.790     5  0.0190 Preprocessor1_Model14
15 2.34e- 3 rmse    standard   0.772     5  0.0139 Preprocessor1_Model15
16 7.85e- 3 rmse    standard   0.766     5  0.0159 Preprocessor1_Model16
17 2.64e- 2 rmse    standard   0.776     5  0.0159 Preprocessor1_Model17
18 8.86e- 2 rmse    standard   0.793     5  0.0166 Preprocessor1_Model18
19 2.98e- 1 rmse    standard   0.827     5  0.0166 Preprocessor1_Model19
20 1   e+ 0 rmse    standard   0.827     5  0.0166 Preprocessor1_Model20
Code
top_lasso_v_models <-
  lasso_v_resamples |>
  show_best() |>
  arrange(penalty) 

lasso_v_resamples|>
  select_best()
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1 0.00785 Preprocessor1_Model16
Code
lasso_v_resamples|>
  autoplot()

prediction

Code
set.seed(904)
lasso_v_pre_wf <- 
  lasso_v_wf |> 
  finalize_workflow(select_best(lasso_v_resamples))

lasso_v_pre_wf <-
  fit(lasso_v_pre_wf, 
      data = crime_train)

lasso_v_predictions <- 
  predict(object = lasso_v_pre_wf, 
          new_data = crime_test)$.pred

actual <- crime_test$log_viol

county <- crime_test$county

lasso_v_predictions_df <-
  tibble(county,actual,lasso_v_predictions)

rmse_lasso_v <- 
  rmse(lasso_v_predictions_df, actual, lasso_v_predictions)

Random Forest

feature and target engineering

Code
rf_v_rec <- 
  recipe(log_viol ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_property,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

rf_v_mod <- 
  rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = 200
  ) |>
  set_mode(mode = "regression") |>
  set_engine(
    engine = "ranger", 
    importance = "impurity",
    num.threads = 4
  )

rf_v_wf <- 
  workflow() |>
  add_recipe(rf_v_rec) |>
  add_model(rf_v_mod)

rf_v_grid <- grid_regular(
  mtry(range = c(1, 15)),
  min_n(range = c(1, 15)),
  levels = 5
)

choose the best model

Code
set.seed(904)
rf_v_resamples <- 
  tune_grid(
  rf_v_wf,
  resamples = crime_folds,
  grid = rf_v_grid)

collect_metrics(rf_v_resamples)
# A tibble: 50 × 8
    mtry min_n .metric .estimator  mean     n std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1     1     1 rmse    standard   0.680     5  0.0144 Preprocessor1_Model01
 2     1     1 rsq     standard   0.341     5  0.0198 Preprocessor1_Model01
 3     4     1 rmse    standard   0.654     5  0.0133 Preprocessor1_Model02
 4     4     1 rsq     standard   0.383     5  0.0184 Preprocessor1_Model02
 5     8     1 rmse    standard   0.647     5  0.0131 Preprocessor1_Model03
 6     8     1 rsq     standard   0.393     5  0.0199 Preprocessor1_Model03
 7    11     1 rmse    standard   0.643     5  0.0132 Preprocessor1_Model04
 8    11     1 rsq     standard   0.401     5  0.0194 Preprocessor1_Model04
 9    15     1 rmse    standard   0.643     5  0.0125 Preprocessor1_Model05
10    15     1 rsq     standard   0.399     5  0.0189 Preprocessor1_Model05
# ℹ 40 more rows
Code
show_best(rf_v_resamples, 
          metric = "rmse") 
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1    15     8 rmse    standard   0.643     5  0.0131 Preprocessor1_Model15
2    11     1 rmse    standard   0.643     5  0.0132 Preprocessor1_Model04
3    15     1 rmse    standard   0.643     5  0.0125 Preprocessor1_Model05
4    15    11 rmse    standard   0.644     5  0.0131 Preprocessor1_Model20
5    11     4 rmse    standard   0.645     5  0.0139 Preprocessor1_Model09
Code
select_best(rf_v_resamples)
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1    15     8 Preprocessor1_Model15
Code
autoplot(rf_v_resamples)

predictions

Code
set.seed(904)
rf_v_pre_wf <- 
  rf_v_wf |> 
  finalize_workflow(select_best(rf_v_resamples))

rf_v_pre_wf <-
  fit(rf_v_pre_wf, 
      data = crime_train)

rf_v_predictions <- 
  predict(object = rf_v_pre_wf, 
          new_data = crime_test)$.pred

actual <- 
  crime_test$log_viol

county <- 
  crime_test$county

rf_v_predictions_df <-
  tibble(county, actual, rf_v_predictions)

rmse_rf_v <- 
  rmse(rf_v_predictions_df, actual, rf_v_predictions)

selecting the top 15 important variables

Code
set.seed(904)
rf_v_final_wf <- 
  rf_v_wf |> 
  finalize_workflow(select_best(rf_v_resamples))

rf_v_fit <- 
  rf_v_final_wf |>
  fit(data = crime_all_2010_cleaned)

crime_labels <- c(
  pnhblack = "Proportion of people non-Hispanic Black",
  pin1b_ = "Proportion of families with Income less than 15K",
  disadvantage  = "Disadvantage1",
  ppubas = "Prop. of households with public assistance income",
  ppov = "Prop. ppl w/ income past 12 months below poverty level",
  value_52 = "Road covered by Shrub/Scrub (sqm)",
  pfhfam = "Proportion female-headed families w/ kids",
  disadvantage2_ = "Disadvantage2",
  prop_value_41 = "Prop. road covered by deciduous Forest (sqm)",
  pnhwhite = "Proportion of people non-Hispanic White",
  ped1_ = "Prop. w/ less than high school diploma",
  prop_value_52 = "Prop. road covered by Shrub/Scrub (sqm)",
  phispanic = "Proportion of people of Hispanic origin",
  pin2b_ = "Proportion of families with Income 15-30K",
  pnwhite = "Proportion of people non-Hispanic White",
  value_41 = "Road covered by deciduous Forest (sqm)",
  punemp = "Prop. 16+ civ labor force unemployed",
  prop_value_21 = "Prop. Developed, Open Space (sqm)",
  value_24 = "Road Developed, High Intensity (sqm)",
  prop_value_24 = "Prop road Developed, High Intensity (sqm)",
  prop_value_23 = "Prop. Road Developed, Medium Intensity (sqm)",
  prop_value_22 = "Prop. Road Developed, Low Intensity (sqm)",
  p18_29  = "Proportion of population 18-29 years of age",
  popden = "Persons per square mile"
)

rf_v_fit|>
  extract_fit_parsnip()|>
  vip(num_features = 15) %>%
  .$data |>
  mutate(
    Importance = Importance / max(Importance),
    Variable = fct_reorder(.x = Importance, .f = Variable)
  ) |>
  ggplot(aes(Importance, Variable)) +
  scale_y_discrete(labels = crime_labels) +
  labs(title = "Top 15 Variables for Violent Crime Prediction") +
  geom_col()

Based on the results of the random forest model, the top 15 most important variables for predicting violent crime are shown above. Most come from the socioeconomic data and a few from the land cover data.

Disadvantage1 is the mean of proportion of people Non-Hispanic Black, proportion female-headed families with kids, proportion of households with public assistance income, proportion people with income past 12 months below poverty level, and proportion 16+ civilian labor force unemployed.

Disadvantage2 is the mean of proportion female-headed families with kids, proportion of households with public assistance income, proportion people with income past 12 months below poverty level, and proportion 16+ civilian labor force unemployed.
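In formula form, using the variable names from the data above:

\[ \text{Disadvantage1} = \tfrac{1}{5}\left(\text{pnhblack} + \text{pfhfam} + \text{ppubas} + \text{ppov} + \text{punemp}\right) \]

\[ \text{Disadvantage2} = \tfrac{1}{4}\left(\text{pfhfam} + \text{ppubas} + \text{ppov} + \text{punemp}\right) \]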

MARS

feature and target engineering

Code
# recipe

mars_v_rec <- 
  recipe(log_viol ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_property,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

mars_v_rec_sum <-
  summary(mars_v_rec)

# model

mars_v_mod <- 
  mars(
  num_terms = tune(), 
  prod_degree = tune()
) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "earth")

# tuning grid

mars_v_grid <- 
  grid_regular(
  num_terms(range = c(10, 100)),
  prod_degree(),
  levels = 10
)

# workflow

mars_v_wf <- 
  workflow() |>
  add_recipe(recipe = mars_v_rec) |>
  add_model(spec = mars_v_mod)

choose the best model

Code
# resample   

set.seed(904)
mars_v_resamples <- 
  mars_v_wf|>
  tune_grid(
  resamples = crime_folds,
  grid = mars_v_grid,
  metrics = metric_set(rmse)
  )

collect_metrics(mars_v_resamples)
# A tibble: 20 × 8
   num_terms prod_degree .metric .estimator  mean     n std_err .config         
       <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
 1        10           1 rmse    standard   0.679     5 0.0107  Preprocessor1_M…
 2        20           1 rmse    standard   0.668     5 0.0123  Preprocessor1_M…
 3        30           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 4        40           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 5        50           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 6        60           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 7        70           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 8        80           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
 9        90           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
10       100           1 rmse    standard   0.668     5 0.0121  Preprocessor1_M…
11        10           2 rmse    standard   0.674     5 0.00990 Preprocessor1_M…
12        20           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
13        30           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
14        40           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
15        50           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
16        60           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
17        70           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
18        80           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
19        90           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
20       100           2 rmse    standard   0.663     5 0.00751 Preprocessor1_M…
Code
top_mars_v_models <-
  mars_v_resamples |>
  show_best()

mars_v_resamples|>
  select_best()
# A tibble: 1 × 3
  num_terms prod_degree .config              
      <int>       <int> <chr>                
1        20           2 Preprocessor1_Model12

prediction

Code
set.seed(904)
mars_v_pre_wf <- 
  mars_v_wf |> 
  finalize_workflow(select_best(mars_v_resamples))

mars_v_pre_wf <-
  fit(mars_v_pre_wf, 
      data = crime_train)

mars_v_predictions <- 
  predict(object = mars_v_pre_wf, 
          new_data = crime_test)$.pred

actual <- crime_test$log_viol

county <- crime_test$county

mars_v_predictions_df <-
  tibble(county, actual, mars_v_predictions)

rmse_mars_v <- rmse(mars_v_predictions_df, actual, mars_v_predictions)

XG Boost

feature and target engineering

Code
xgb_v_rec <- 
  recipe(log_viol ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_property,
              new_role = "id")|>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

all_cores <- 
  parallel::detectCores(logical = FALSE)

registerDoFuture()

cl <- 
  makeCluster(all_cores - 3L)

plan(cluster, workers = cl)

xgb_v_spec <- 
  boost_tree(
  trees = 200,
  tree_depth = tune(), 
  min_n = tune(),
  loss_reduction = tune(),                     
  sample_size = tune(), 
  mtry = tune(),         
  learn_rate = tune()                   
) |>
  set_engine("xgboost") |>
  set_mode("regression")

set.seed(904)
xgb_v_grid <- 
  grid_latin_hypercube(
  tree_depth(),
  min_n(range = c(2, 10)),
  loss_reduction(range = c(-5, -3)),
  sample_size = sample_prop(),
  mtry(range = c(1, 10)),
  learn_rate(range = c(-5, -1)),
  size = 30
)

xgb_v_wf <- workflow() |>
  add_recipe(xgb_v_rec) |>
  add_model(xgb_v_spec)
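The parallel workers registered above stay alive until explicitly released. Once tuning is finished, they can be shut down; a sketch, assuming the cl object created above:

Code
# return to sequential execution and release the workers
plan(sequential)
parallel::stopCluster(cl)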

choose the best model

Code
set.seed(904)
xgb_v_resamples <- 
  tune_grid(
  xgb_v_wf,
  resamples = crime_folds,
  grid = xgb_v_grid)

collect_metrics(xgb_v_resamples)
# A tibble: 60 × 12
    mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
 1     4     8          2  0.000393       0.000173        0.675 rmse   
 2     4     8          2  0.000393       0.000173        0.675 rsq    
 3     7     2          4  0.000566       0.000970        0.829 rmse   
 4     7     2          4  0.000566       0.000970        0.829 rsq    
 5     4     7          9  0.0756         0.0000200       0.111 rmse   
 6     4     7          9  0.0756         0.0000200       0.111 rsq    
 7     7     3         14  0.00179        0.000214        0.413 rmse   
 8     7     3         14  0.00179        0.000214        0.413 rsq    
 9     5     8          6  0.0000187      0.0000519       0.195 rmse   
10     5     8          6  0.0000187      0.0000519       0.195 rsq    
# ℹ 50 more rows
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>
Code
show_best(xgb_v_resamples, metric = "rmse") 
# A tibble: 5 × 12
   mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
  <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
1     6     7          6     0.0461      0.0000290       0.547 rmse   
2     4     4         13     0.0262      0.0000333       0.922 rmse   
3     2     6          5     0.0559      0.000137        0.998 rmse   
4     2     9          8     0.0306      0.0000118       0.156 rmse   
5    10     3          4     0.0167      0.0000631       0.631 rmse   
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>
Code
select_best(xgb_v_resamples, metric= "rmse")
# A tibble: 1 × 7
   mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
  <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>            
1     6     7          6     0.0461      0.0000290       0.547 Preprocessor1_Mo…

prediction

Code
set.seed(904)
xgb_v_pre_wf <- 
  xgb_v_wf |> 
  finalize_workflow(select_best(xgb_v_resamples))

xgb_v_pre_wf <-
  fit(xgb_v_pre_wf, 
      data = crime_train)

xgb_v_predictions <- 
  predict(object = xgb_v_pre_wf, 
          new_data = crime_test)$.pred

actual <- 
  crime_test$log_viol

county <-
  crime_test$county

xgb_v_predictions_df <-
  tibble(county,actual,xgb_v_predictions)

rmse_xgb_v <- 
  rmse(xgb_v_predictions_df, actual, xgb_v_predictions)

Final comparison of RMSE

Code
violence_comparison <-
  bind_rows(rmse_ridge_v, rmse_lasso_v, rmse_rf_v, rmse_mars_v, rmse_xgb_v)

violence_comparison|>
  mutate(
    model = c("Ridge Regression","Lasso Regression",
              "Random Forest", "MARS", "XG Boost"))|>
  select(model, everything())
# A tibble: 5 × 4
  model            .metric .estimator .estimate
  <chr>            <chr>   <chr>          <dbl>
1 Ridge Regression rmse    standard       0.822
2 Lasso Regression rmse    standard       0.817
3 Random Forest    rmse    standard       0.686
4 MARS             rmse    standard       0.712
5 XG Boost         rmse    standard       0.674

Because XG Boost has the lowest RMSE, we will use it to predict the violent crime rate in 2016.

Prediction on implementation violent crime data in 2016

Code
set.seed(904)
xgb_v_pre_wf_2016 <- 
  xgb_v_wf |> 
  finalize_workflow(select_best(xgb_v_resamples))

xgb_v_pre_wf_2016 <-
  fit(xgb_v_pre_wf_2016, 
      data = crime_all_2010_cleaned)

xgb_v_predictions_2016 <- 
  predict(object = xgb_v_pre_wf_2016, 
          new_data = crime_all_2016_cleaned)$.pred

actual_implement_v <- crime_all_2016_cleaned$log_viol

county_implement <- crime_all_2016_cleaned$county

xgb_v_predictions_2016_df <-
  tibble(county_implement,actual_implement_v,xgb_v_predictions_2016)

rmse_xgb_v_2016 <-
  rmse(xgb_v_predictions_2016_df, actual_implement_v, xgb_v_predictions_2016)

rmse_xgb_v_2016
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.593

The final prediction for the violent crime rate in 2016 has an RMSE of 0.593.

Property Crime

Ridge Regression

feature and target engineering

Code
# create a recipe

ridge_p_rec <- 
  recipe(log_property ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_viol,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())|>
  step_normalize(all_predictors())
 
# check if feature engineering works to remove variables

ridge_p_summary <-
  summary(ridge_p_rec)

# model

ridge_p_mod <- 
  linear_reg(penalty = tune(), mixture = 0) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "glmnet")  

ridge_p_grid <-
  grid_regular(penalty(),
               levels = 20)
# workflow

ridge_p_wf <- 
  workflow() |>
  add_recipe(recipe = ridge_p_rec) |>
  add_model(spec = ridge_p_mod)

choose the best model

Code
ridge_p_resamples <-
  ridge_p_wf |>
  tune_grid(
    resamples = crime_folds,
    grid = ridge_p_grid,
    metrics = metric_set(rmse)
    )

ridge_p_resamples |>
  collect_metrics()
# A tibble: 20 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 1   e-10 rmse    standard   0.606     5  0.0174 Preprocessor1_Model01
 2 3.36e-10 rmse    standard   0.606     5  0.0174 Preprocessor1_Model02
 3 1.13e- 9 rmse    standard   0.606     5  0.0174 Preprocessor1_Model03
 4 3.79e- 9 rmse    standard   0.606     5  0.0174 Preprocessor1_Model04
 5 1.27e- 8 rmse    standard   0.606     5  0.0174 Preprocessor1_Model05
 6 4.28e- 8 rmse    standard   0.606     5  0.0174 Preprocessor1_Model06
 7 1.44e- 7 rmse    standard   0.606     5  0.0174 Preprocessor1_Model07
 8 4.83e- 7 rmse    standard   0.606     5  0.0174 Preprocessor1_Model08
 9 1.62e- 6 rmse    standard   0.606     5  0.0174 Preprocessor1_Model09
10 5.46e- 6 rmse    standard   0.606     5  0.0174 Preprocessor1_Model10
11 1.83e- 5 rmse    standard   0.606     5  0.0174 Preprocessor1_Model11
12 6.16e- 5 rmse    standard   0.606     5  0.0174 Preprocessor1_Model12
13 2.07e- 4 rmse    standard   0.606     5  0.0174 Preprocessor1_Model13
14 6.95e- 4 rmse    standard   0.606     5  0.0174 Preprocessor1_Model14
15 2.34e- 3 rmse    standard   0.606     5  0.0174 Preprocessor1_Model15
16 7.85e- 3 rmse    standard   0.606     5  0.0174 Preprocessor1_Model16
17 2.64e- 2 rmse    standard   0.605     5  0.0178 Preprocessor1_Model17
18 8.86e- 2 rmse    standard   0.602     5  0.0199 Preprocessor1_Model18
19 2.98e- 1 rmse    standard   0.605     5  0.0213 Preprocessor1_Model19
20 1   e+ 0 rmse    standard   0.613     5  0.0216 Preprocessor1_Model20
Code
top_ridge_p_models <-
  ridge_p_resamples |>
  show_best() |>
  arrange(penalty) 

ridge_p_resamples|>
  select_best()
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1  0.0886 Preprocessor1_Model18
Code
ridge_p_resamples|>
  autoplot()

prediction

Code
set.seed(904)
ridge_p_pre_wf <- 
  ridge_p_wf |> 
  finalize_workflow(select_best(ridge_p_resamples))

ridge_p_pre_wf <-
  fit(ridge_p_pre_wf, 
      data = crime_train)

ridge_p_predictions <- 
  predict(object = ridge_p_pre_wf, 
          new_data = crime_test)$.pred

actual_p <- 
  crime_test$log_property

county <- 
  crime_test$county

ridge_p_predictions_df <-
  tibble(county, actual_p, ridge_p_predictions)

rmse_ridge_p <- 
  rmse(ridge_p_predictions_df, actual_p, ridge_p_predictions)

Lasso

feature and target engineering

Code
lasso_p_rec <- 
  recipe(log_property ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_viol,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())|>
  step_normalize(all_predictors())

# inspect the variable roles assigned in the recipe

lasso_p_summary <-
  summary(lasso_p_rec)

# create a model

lasso_p_mod <- 
  linear_reg(penalty = tune(), mixture = 1) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "glmnet")  
 
lasso_p_grid <-
  grid_regular(penalty(),
               levels = 20)

# workflow

lasso_p_wf <-
  workflow() |>
  add_recipe(recipe = lasso_p_rec) |>
  add_model(spec = lasso_p_mod)

choose the best model

Code
set.seed(904)
lasso_p_resamples <- 
  lasso_p_wf|>
  tune_grid(
    resamples = crime_folds,
    grid = lasso_p_grid,
    metrics = metric_set(rmse)
    )

lasso_p_resamples |>
  collect_metrics()
# A tibble: 20 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 1   e-10 rmse    standard   0.678     5  0.0402 Preprocessor1_Model01
 2 3.36e-10 rmse    standard   0.678     5  0.0402 Preprocessor1_Model02
 3 1.13e- 9 rmse    standard   0.678     5  0.0402 Preprocessor1_Model03
 4 3.79e- 9 rmse    standard   0.678     5  0.0402 Preprocessor1_Model04
 5 1.27e- 8 rmse    standard   0.678     5  0.0402 Preprocessor1_Model05
 6 4.28e- 8 rmse    standard   0.678     5  0.0402 Preprocessor1_Model06
 7 1.44e- 7 rmse    standard   0.678     5  0.0402 Preprocessor1_Model07
 8 4.83e- 7 rmse    standard   0.678     5  0.0402 Preprocessor1_Model08
 9 1.62e- 6 rmse    standard   0.678     5  0.0402 Preprocessor1_Model09
10 5.46e- 6 rmse    standard   0.678     5  0.0402 Preprocessor1_Model10
11 1.83e- 5 rmse    standard   0.678     5  0.0402 Preprocessor1_Model11
12 6.16e- 5 rmse    standard   0.665     5  0.0325 Preprocessor1_Model12
13 2.07e- 4 rmse    standard   0.637     5  0.0199 Preprocessor1_Model13
14 6.95e- 4 rmse    standard   0.611     5  0.0174 Preprocessor1_Model14
15 2.34e- 3 rmse    standard   0.607     5  0.0177 Preprocessor1_Model15
16 7.85e- 3 rmse    standard   0.604     5  0.0204 Preprocessor1_Model16
17 2.64e- 2 rmse    standard   0.611     5  0.0212 Preprocessor1_Model17
18 8.86e- 2 rmse    standard   0.630     5  0.0216 Preprocessor1_Model18
19 2.98e- 1 rmse    standard   0.656     5  0.0214 Preprocessor1_Model19
20 1   e+ 0 rmse    standard   0.656     5  0.0214 Preprocessor1_Model20
Code
top_lasso_p_models <-
  lasso_p_resamples |>
  show_best() |>
  arrange(penalty) 

lasso_p_resamples|>
  select_best()
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1 0.00785 Preprocessor1_Model16
Code
lasso_p_resamples|>
  autoplot()

prediction

Code
set.seed(904)
lasso_p_pre_wf <- 
  lasso_p_wf |> 
  finalize_workflow(select_best(lasso_p_resamples))

lasso_p_pre_wf <-
  fit(lasso_p_pre_wf, 
      data = crime_train)

lasso_p_predictions <- 
  predict(object = lasso_p_pre_wf, 
          new_data = crime_test)$.pred

actual_p <- 
  crime_test$log_property

county <- 
  crime_test$county

lasso_p_predictions_df <-
  tibble(county, actual_p, lasso_p_predictions)

rmse_lasso_p <- 
  rmse(lasso_p_predictions_df, actual_p, lasso_p_predictions)

Random Forest

feature and target engineering

Code
rf_p_rec <- 
  recipe(log_property ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_viol,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

rf_p_mod <- 
  rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = 200
  ) |>
  set_mode(mode = "regression") |>
  set_engine(
    engine = "ranger", 
    importance = "impurity",
    num.threads = 4
  )

rf_p_wf <- 
  workflow() |>
  add_recipe(rf_p_rec) |>
  add_model(rf_p_mod)

rf_p_grid <- 
  grid_regular(
  mtry(range = c(1, 15)),
  min_n(range = c(1, 15)),
  levels = 5
)

choose the best model

Code
set.seed(904)
rf_p_resamples <- 
  tune_grid(
  rf_p_wf,
  resamples = crime_folds,
  grid = rf_p_grid)

collect_metrics(rf_p_resamples)
# A tibble: 50 × 8
    mtry min_n .metric .estimator  mean     n std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1     1     1 rmse    standard   0.497     5  0.0200 Preprocessor1_Model01
 2     1     1 rsq     standard   0.431     5  0.0107 Preprocessor1_Model01
 3     4     1 rmse    standard   0.487     5  0.0196 Preprocessor1_Model02
 4     4     1 rsq     standard   0.452     5  0.0142 Preprocessor1_Model02
 5     8     1 rmse    standard   0.485     5  0.0190 Preprocessor1_Model03
 6     8     1 rsq     standard   0.456     5  0.0120 Preprocessor1_Model03
 7    11     1 rmse    standard   0.485     5  0.0196 Preprocessor1_Model04
 8    11     1 rsq     standard   0.456     5  0.0130 Preprocessor1_Model04
 9    15     1 rmse    standard   0.482     5  0.0187 Preprocessor1_Model05
10    15     1 rsq     standard   0.463     5  0.0115 Preprocessor1_Model05
# ℹ 40 more rows
Code
show_best(rf_p_resamples, 
          metric = "rmse") 
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1    15     1 rmse    standard   0.482     5  0.0187 Preprocessor1_Model05
2    15     4 rmse    standard   0.483     5  0.0204 Preprocessor1_Model10
3    11     4 rmse    standard   0.483     5  0.0190 Preprocessor1_Model09
4    11    11 rmse    standard   0.484     5  0.0198 Preprocessor1_Model19
5    15    15 rmse    standard   0.484     5  0.0188 Preprocessor1_Model25
Code
select_best(rf_p_resamples, metric = "rmse")
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1    15     1 Preprocessor1_Model05
Code
autoplot(rf_p_resamples)

predictions

Code
set.seed(904)
rf_p_pre_wf <- 
  rf_p_wf |> 
  finalize_workflow(select_best(rf_p_resamples, metric = "rmse"))

rf_p_pre_wf <-
  fit(rf_p_pre_wf, 
      data = crime_train)

rf_p_predictions <- 
  predict(object = rf_p_pre_wf, 
          new_data = crime_test)$.pred

actual_p <-
  crime_test$log_property

county <- 
  crime_test$county

rf_p_predictions_df <-
  tibble(county, actual_p, rf_p_predictions)

rmse_rf_p <- 
  rmse(rf_p_predictions_df, actual_p, rf_p_predictions)

selecting the top 15 important variables

Code
rf_p_final_wf <- 
  rf_p_wf |> 
  finalize_workflow(select_best(rf_p_resamples, metric = "rmse"))

rf_p_fit <- 
  rf_p_final_wf |>
  fit(data = crime_all_2010_cleaned)

set.seed(904)
# pull the importance data out of the vip plot object, rescale it
# relative to the most important variable, and plot the top 15

rf_p_fit |>
  extract_fit_parsnip() |>
  vip(num_features = 15) %>%
  .$data |>
  mutate(
    Importance = Importance / max(Importance),
    Variable = fct_reorder(.f = Variable, .x = Importance)
  ) |>
  ggplot(aes(x = Importance, y = Variable)) +
  geom_col() +
  scale_y_discrete(labels = crime_labels) +
  labs(title = "Top 15 Variables for Property Crime Prediction")

Based on the results from the random forest model, the top 15 most important variables for predicting property crime are shown above. Most of these variables come from the socioeconomic data, and a few come from the land cover data.
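
If the numeric scores are needed rather than the plot, `vip::vi()` returns the same impurity-based importances as a tibble; a minimal sketch using the fitted workflow above:

Code
# raw impurity-based importance scores for the top 15 predictors

rf_p_fit |>
  extract_fit_parsnip() |>
  vi() |>
  slice_max(Importance, n = 15)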

MARS

feature and target engineering

Code
# recipe

mars_p_rec <- 
  recipe(log_property ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_viol,
              new_role = "id") |>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

mars_p_rec_sum <- summary(mars_p_rec)

# model

mars_p_mod <- 
  mars(
  num_terms = tune(), 
  prod_degree = tune()
) |>
  set_mode(mode = "regression") |>
  set_engine(engine = "earth")

# tuning grid

mars_p_grid <- 
  grid_regular(
  num_terms(range = c(10, 100)),
  prod_degree(),
  levels = 10
)

# workflow

mars_p_wf <- 
  workflow() |>
  add_recipe(recipe = mars_p_rec) |>
  add_model(spec = mars_p_mod)

choose the best model

Code
# resample   

set.seed(904)
mars_p_resamples <- 
  mars_p_wf|>
  tune_grid(
  resamples = crime_folds,
  grid = mars_p_grid,
  metrics = metric_set(rmse)
  )

collect_metrics(mars_p_resamples)
# A tibble: 20 × 8
   num_terms prod_degree .metric .estimator  mean     n std_err .config         
       <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
 1        10           1 rmse    standard   0.513     5  0.0202 Preprocessor1_M…
 2        20           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 3        30           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 4        40           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 5        50           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 6        60           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 7        70           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 8        80           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
 9        90           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
10       100           1 rmse    standard   0.508     5  0.0201 Preprocessor1_M…
11        10           2 rmse    standard   0.506     5  0.0173 Preprocessor1_M…
12        20           2 rmse    standard   0.503     5  0.0161 Preprocessor1_M…
13        30           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
14        40           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
15        50           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
16        60           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
17        70           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
18        80           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
19        90           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
20       100           2 rmse    standard   0.502     5  0.0166 Preprocessor1_M…
Code
top_mars_p_models <-
  mars_p_resamples |>
  show_best()

mars_p_resamples|>
  select_best()
# A tibble: 1 × 3
  num_terms prod_degree .config              
      <int>       <int> <chr>                
1        30           2 Preprocessor1_Model13

prediction

Code
set.seed(904)
mars_p_pre_wf <- 
  mars_p_wf |> 
  finalize_workflow(select_best(mars_p_resamples))

mars_p_pre_wf <-
  fit(mars_p_pre_wf, 
      data = crime_train)

mars_p_predictions <- 
  predict(object = mars_p_pre_wf, 
          new_data = crime_test)$.pred

actual_p <- 
  crime_test$log_property

county <- 
  crime_test$county

mars_p_predictions_df <-
  tibble(county, actual_p, mars_p_predictions)

rmse_mars_p <- 
  rmse(mars_p_predictions_df, actual_p, mars_p_predictions)

XG Boost

feature and target engineering

Code
xgb_p_rec <-
  recipe(log_property ~ .,
         data = crime_train) |>
  update_role(county,
              year,
              viol,
              viol_rate,
              property_rate,
              property,
              log_viol,
              new_role = "id")|>
  step_impute_median(all_predictors())|>
  step_nzv(all_predictors())

# set up parallel workers for tuning, keeping a few physical cores
# free for the rest of the system

all_cores <- 
  parallel::detectCores(logical = FALSE)

registerDoFuture()

cl <- 
  makeCluster(all_cores - 3L)

plan(cluster, workers = cl)

xgb_p_spec <- 
  boost_tree(
  trees = 200,
  tree_depth = tune(), 
  min_n = tune(),
  loss_reduction = tune(),                     
  sample_size = tune(), 
  mtry = tune(),         
  learn_rate = tune()                   
) |>
  set_engine("xgboost") |>
  set_mode("regression")


set.seed(904)

# with six tuning parameters a regular grid would be combinatorially
# large, so sample 30 space-filling candidates instead

xgb_p_grid <- 
  grid_latin_hypercube(
  tree_depth(),
  min_n(range = c(2, 10)),
  loss_reduction(range = c(-5, -3)),
  sample_size = sample_prop(),
  mtry(range = c(1, 10)),
  learn_rate(range = c(-5, -1)),
  size = 30
)

xgb_p_wf <- workflow() |>
  add_recipe(xgb_p_rec) |>
  add_model(xgb_p_spec)

choose the best model

Code
set.seed(904)
xgb_p_resamples <- 
  tune_grid(
  xgb_p_wf,
  resamples = crime_folds,
  grid = xgb_p_grid)

collect_metrics(xgb_p_resamples)
# A tibble: 60 × 12
    mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
 1     4     8          2  0.000393       0.000173        0.675 rmse   
 2     4     8          2  0.000393       0.000173        0.675 rsq    
 3     7     2          4  0.000566       0.000970        0.829 rmse   
 4     7     2          4  0.000566       0.000970        0.829 rsq    
 5     4     7          9  0.0756         0.0000200       0.111 rmse   
 6     4     7          9  0.0756         0.0000200       0.111 rsq    
 7     7     3         14  0.00179        0.000214        0.413 rmse   
 8     7     3         14  0.00179        0.000214        0.413 rsq    
 9     5     8          6  0.0000187      0.0000519       0.195 rmse   
10     5     8          6  0.0000187      0.0000519       0.195 rsq    
# ℹ 50 more rows
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>
Code
show_best(xgb_p_resamples, metric = "rmse") 
# A tibble: 5 × 12
   mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
  <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
1     4     4         13     0.0262      0.0000333       0.922 rmse   
2     6     7          6     0.0461      0.0000290       0.547 rmse   
3     2     6          5     0.0559      0.000137        0.998 rmse   
4     2     9          8     0.0306      0.0000118       0.156 rmse   
5     4     7          9     0.0756      0.0000200       0.111 rmse   
# ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>
Code
select_best(xgb_p_resamples, metric = "rmse")
# A tibble: 1 × 7
   mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
  <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>            
1     4     4         13     0.0262      0.0000333       0.922 Preprocessor1_Mo…
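
With tuning complete, the parallel workers registered before the grid search are no longer needed; a minimal housekeeping sketch, assuming the `cl` cluster object created above:

Code
# release the tuning cluster and return future to sequential evaluation

stopCluster(cl)
plan(sequential)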

prediction

Code
set.seed(904)
xgb_p_pre_wf <- 
  xgb_p_wf |> 
  finalize_workflow(select_best(xgb_p_resamples, metric = "rmse"))

xgb_p_pre_wf <-
  fit(xgb_p_pre_wf, 
      data = crime_train)

xgb_p_predictions <- 
  predict(object = xgb_p_pre_wf, 
          new_data = crime_test)$.pred

actual_p <- 
  crime_test$log_property

county <- 
  crime_test$county

xgb_p_predictions_df <-
  tibble(county, actual_p, xgb_p_predictions)

rmse_xgb_p <- 
  rmse(xgb_p_predictions_df, actual_p, xgb_p_predictions)

Final comparison of RMSE

Code
property_comparison <-
  bind_rows(rmse_ridge_p, rmse_lasso_p, rmse_rf_p, rmse_mars_p, rmse_xgb_p)

property_comparison|>
  mutate(
    model = c("Ridge Regression","Lasso Regression", 
              "Random Forest", "MARS", "XG Boost"))|>
  select(model, everything())
# A tibble: 5 × 4
  model            .metric .estimator .estimate
  <chr>            <chr>   <chr>          <dbl>
1 Ridge Regression rmse    standard       0.601
2 Lasso Regression rmse    standard       0.607
3 Random Forest    rmse    standard       0.474
4 MARS             rmse    standard       0.510
5 XG Boost         rmse    standard       0.477

Because the random forest model has the lowest RMSE, we will use it to predict the property crime rate in 2016.
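
The same choice can be made programmatically rather than by inspection; a minimal sketch using the comparison tibble built above:

Code
# attach model labels and pull out the row with the lowest test RMSE

property_comparison |>
  mutate(
    model = c("Ridge Regression", "Lasso Regression",
              "Random Forest", "MARS", "XG Boost")) |>
  slice_min(.estimate, n = 1)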

Prediction on implementation property crime data in 2016

Code
set.seed(904)
rf_p_pre_wf_2016 <- 
  rf_p_wf |> 
  finalize_workflow(select_best(rf_p_resamples, metric = "rmse"))

rf_p_pre_wf_2016 <-
  fit(rf_p_pre_wf_2016, 
      data = crime_all_2010_cleaned)

rf_p_predictions_2016 <- 
  predict(object = rf_p_pre_wf_2016, 
          new_data = crime_all_2016_cleaned)$.pred

actual_implement_p <- crime_all_2016_cleaned$log_property

county_implement <- crime_all_2016_cleaned$county

rf_p_predictions_2016_df <-
  tibble(county_implement, actual_implement_p, rf_p_predictions_2016)

rmse_rf_p_2016 <- 
  rmse(rf_p_predictions_2016_df, actual_implement_p, rf_p_predictions_2016)

rmse_rf_p_2016
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.459

The final prediction for the property crime rate in 2016 has an RMSE of 0.459.
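
Since the model predicts the log of the crime rate, this RMSE is on the log scale. Exponentiating gives a rough multiplicative error factor; a back-of-the-envelope sketch:

Code
# exp(0.459) is roughly 1.58, so a typical prediction is within about
# a factor of 1.6 of the observed property crime rate

exp(rmse_rf_p_2016$.estimate)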