# Target: Personal Loan
# Objetivo: Aumentar el número de clientes prestatarios (clientes de activo)
# Convertir los clientes pasivos (depositantes) en prestatarios (conservándolos como depositantes)
# --- Session setup ----
# NOTE(review): `rm(list = ls())` and `setwd()` inside a script are discouraged
# (they clobber the user's session); kept as-is to preserve behavior.
rm(list = ls())
graphics.off() # close any open graphics devices
cat("\014")    # clear the console (RStudio shortcut)
options(scipen = 999) # disable scientific notation
options(digits = 3) # number of significant digits to print
# Work relative to this script's folder (requires RStudio / rstudioapi)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#getwd()
# --- Read the data ----
library(readxl)
data <- read_excel("Bank_Personal_Loan_Modelling.xlsx",sheet = "Data")
head(data)
## # A tibble: 6 × 14
## ID Age Experience Income `ZIP Code` Family CCAvg Education Mortgage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25 1 49 91107 4 1.6 1 0
## 2 2 45 19 34 90089 3 1.5 1 0
## 3 3 39 15 11 94720 1 1 1 0
## 4 4 35 9 100 94112 1 2.7 2 0
## 5 5 35 8 45 91330 4 1 2 0
## 6 6 37 13 29 92121 4 0.4 2 155
## # ℹ 5 more variables: `Personal Loan` <dbl>, `Securities Account` <dbl>,
## # `CD Account` <dbl>, Online <dbl>, CreditCard <dbl>
# Drop the ID column (first column) — a row identifier, not relevant to the analysis
data<-data[-1]
data<-data.frame(data) # convert the tibble to a plain data.frame
head(data)
## Age Experience Income ZIP.Code Family CCAvg Education Mortgage Personal.Loan
## 1 25 1 49 91107 4 1.6 1 0 0
## 2 45 19 34 90089 3 1.5 1 0 0
## 3 39 15 11 94720 1 1.0 1 0 0
## 4 35 9 100 94112 1 2.7 2 0 0
## 5 35 8 45 91330 4 1.0 2 0 0
## 6 37 13 29 92121 4 0.4 2 155 0
## Securities.Account CD.Account Online CreditCard
## 1 1 0 0 0
## 2 1 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 1
## 6 0 0 1 0
# Data types of each column
str(data)
## 'data.frame': 5000 obs. of 13 variables:
## $ Age : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : num 91107 90089 94720 94112 91330 ...
## $ Family : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account: num 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : num 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : num 0 0 0 0 1 0 0 1 0 0 ...
# Variable roles:
#   Nominal : ZIP.Code
#   Numeric : Mortgage, CCAvg, Family, Income, Experience, Age
#   Ordinal : Education
#   Binary  : Personal.Loan (target), Securities.Account, CD.Account,
#             Online, CreditCard
# Convert categorical variables to factors
data$ZIP.Code <- as.factor(data$ZIP.Code)
data$Personal.Loan <- as.factor(data$Personal.Loan)
data$Securities.Account <- as.factor(data$Securities.Account)
data$CD.Account <- as.factor(data$CD.Account)
data$Online <- as.factor(data$Online)
data$CreditCard <- as.factor(data$CreditCard)
# The remaining columns stay numeric
# Verify the conversion
str(data)
## 'data.frame': 5000 obs. of 13 variables:
## $ Age : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : Factor w/ 467 levels "9307","90005",..: 84 35 368 299 97 161 116 268 35 236 ...
## $ Family : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ Securities.Account: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
# Statistical summary of every column
summary(data)
## Age Experience Income ZIP.Code Family
## Min. :23.0 Min. :-3.0 Min. : 8.0 94720 : 169 Min. :1.0
## 1st Qu.:35.0 1st Qu.:10.0 1st Qu.: 39.0 94305 : 127 1st Qu.:1.0
## Median :45.0 Median :20.0 Median : 64.0 95616 : 116 Median :2.0
## Mean :45.3 Mean :20.1 Mean : 73.8 90095 : 71 Mean :2.4
## 3rd Qu.:55.0 3rd Qu.:30.0 3rd Qu.: 98.0 93106 : 57 3rd Qu.:3.0
## Max. :67.0 Max. :43.0 Max. :224.0 92037 : 54 Max. :4.0
## (Other):4406
## CCAvg Education Mortgage Personal.Loan Securities.Account
## Min. : 0.00 Min. :1.00 Min. : 0 0:4520 0:4478
## 1st Qu.: 0.70 1st Qu.:1.00 1st Qu.: 0 1: 480 1: 522
## Median : 1.50 Median :2.00 Median : 0
## Mean : 1.94 Mean :1.88 Mean : 56
## 3rd Qu.: 2.50 3rd Qu.:3.00 3rd Qu.:101
## Max. :10.00 Max. :3.00 Max. :635
##
## CD.Account Online CreditCard
## 0:4698 0:2016 0:3530
## 1: 302 1:2984 1:1470
##
##
##
##
##
# Variability of the numeric columns
var(data$Age) # variance
## [1] 131
sd(data$Age) # standard deviation (values vary about +-11.5 around the mean)
## [1] 11.5
var(data$Experience)
## [1] 132
sd(data$Experience)
## [1] 11.5
var(data$Income)
## [1] 2119
sd(data$Income)
## [1] 46
var(data$Family)
## [1] 1.32
sd(data$Family)
## [1] 1.15
var(data$CCAvg)
## [1] 3.05
sd(data$CCAvg)
## [1] 1.75
var(data$Mortgage)
## [1] 10346
sd(data$Mortgage)
## [1] 102
# IQR (interquartile range): read off the 1st/3rd quartiles from the summary
summary(data)
## Age Experience Income ZIP.Code Family
## Min. :23.0 Min. :-3.0 Min. : 8.0 94720 : 169 Min. :1.0
## 1st Qu.:35.0 1st Qu.:10.0 1st Qu.: 39.0 94305 : 127 1st Qu.:1.0
## Median :45.0 Median :20.0 Median : 64.0 95616 : 116 Median :2.0
## Mean :45.3 Mean :20.1 Mean : 73.8 90095 : 71 Mean :2.4
## 3rd Qu.:55.0 3rd Qu.:30.0 3rd Qu.: 98.0 93106 : 57 3rd Qu.:3.0
## Max. :67.0 Max. :43.0 Max. :224.0 92037 : 54 Max. :4.0
## (Other):4406
## CCAvg Education Mortgage Personal.Loan Securities.Account
## Min. : 0.00 Min. :1.00 Min. : 0 0:4520 0:4478
## 1st Qu.: 0.70 1st Qu.:1.00 1st Qu.: 0 1: 480 1: 522
## Median : 1.50 Median :2.00 Median : 0
## Mean : 1.94 Mean :1.88 Mean : 56
## 3rd Qu.: 2.50 3rd Qu.:3.00 3rd Qu.:101
## Max. :10.00 Max. :3.00 Max. :635
##
## CD.Account Online CreditCard
## 0:4698 0:2016 0:3530
## 1: 302 1:2984 1:1470
##
##
##
##
##
# Histograms of the numeric variables
hist(data$Age, main=NULL)
hist(data$Experience, main=NULL)
# Note: Experience has negative values, which is not valid
hist(data$Income, main=NULL)
hist(data$Family, main=NULL)
hist(data$CCAvg, main=NULL)
hist(data$Mortgage, main=NULL)
# Boxplots
boxplot(data$Age,main = "Age")
boxplot(data$Experience,main = "Experience")
boxplot(data$Income,main = "Income")
boxplot(data$Family,main = "Family")
boxplot(data$CCAvg,main = "CCAvg")
boxplot(data$Mortgage,main = "Mortgage")
# Outliers are visible in Income, CCAvg and Mortgage
# Bar charts for the categorical variables
barplot(table(data$ZIP.Code),legend.text = "ZIP.Code")
barplot(table(data$Education),legend.text = "Education")
barplot(table(data$Securities.Account),legend.text = "Securities.Account")
barplot(table(data$CD.Account),legend.text = "CD.Account")
barplot(table(data$Online),legend.text = "Online")
barplot(table(data$CreditCard),legend.text = "CreditCard")
barplot(table(data$Personal.Loan),legend.text = "Personal.Loan")
# The target classes are highly imbalanced
# Scatter plots between numeric variables
plot(data$Age, data$Experience,
main = "Gráfico de Dispersión",
xlab = "Age",
ylab = "Experience",
pch = 19,
col = "Green")
plot(data$Experience, data$Income,
main = "Gráfico de Dispersión",
xlab = "Experience",
ylab = "Income",
pch = 19,
col = "Green")
plot(data$Age, data$Income,
main = "Gráfico de Dispersión",
xlab = "Age",
ylab = "Income",
pch = 19,
col = "Green")
plot(data$CCAvg, data$Income,
main = "Gráfico de Dispersión",
xlab = "CCAvg",
ylab = "Income",
pch = 19,
col = "Green")
plot(data$CCAvg, data$Mortgage,
main = "Gráfico de Dispersión",
xlab = "CCAvg",
ylab = "Mortgage",
pch = 19,
col = "Green")
# --- Correlation matrix ----
# NOTE(review): `as.numeric()` on a factor returns the level codes (here 1/2,
# not the original 0/1). Correlation is unaffected (affine transform), but
# the target keeps levels "1"/"2" for the rest of the script — verify this
# coding is intended downstream.
data$Personal.Loan <- as.numeric(data$Personal.Loan)
correlation_matrix<-cor(data[, c("Age", "Experience", "Income", "CCAvg", "Education","Mortgage","Personal.Loan")])
library(corrplot)
## corrplot 0.92 loaded
corrplot(correlation_matrix, method="number")
# Detect duplicated rows
#duplicated(data)
which(duplicated(data))
## integer(0)
# No duplicated rows
# Identify missing values
library(naniar)
vis_miss(data)
# No missing values at first sight; however, the exploratory analysis showed
# negative Experience values, which cannot be valid, so they are recoded as
# missing data
data$Experience[data$Experience < 0] <- NA
vis_miss(data)
# Keep a copy to preserve the original data
data2<-data
# Imputation by the mean
data2$Experience <- ifelse(is.na(data2$Experience),
mean(data2$Experience, na.rm = TRUE),
data2$Experience)
data_imp_media <- data2
vis_miss(data_imp_media)
vis_miss(data)
# Imputation by KNN
#install.packages(c("xts", "quantmod", "zoo", "ROCR"))
#install.packages("devtools")
#devtools::install_version('DMwR', '0.4.1')
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
data_imp_knn <- knnImputation(data) # fill missing values via KNN over the whole data
vis_miss(data_imp_knn)
#vis_miss(data)
# The KNN-imputed data is chosen as it gives better results
# From the exploratory analysis, the outliers are in Income, CCAvg and Mortgage.
# Treatment: values outside [P5, P95] are replaced with the column mean.
#
# Replace outliers in one column of a data.frame with the column mean.
#
# Args:
#   data     : a data.frame
#   column   : name of the numeric column to treat
#   removeNA : passed as `na.rm` to quantile() and mean() (TRUE drops NAs)
# Returns the data.frame with the treated column written back.
tratamiento_outliers <- function(data, column, removeNA = TRUE) {
  x <- data[[column]]
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA) # P5 and P95
  # Bug fix: compute the replacement mean ONCE, before any substitution.
  # The original recomputed mean(x) after replacing the low tail, so the
  # high-tail outliers were replaced with a different, already-shifted mean.
  center <- mean(x, na.rm = removeNA)
  x[x < quantiles[1]] <- center
  x[x > quantiles[2]] <- center
  data[[column]] <- x # column with outliers replaced
  data
}
# Income outlier statistics before treatment
boxplot.stats(data_imp_knn$Income)
## $stats
## [1] 8 39 64 98 185
##
## $n
## [1] 5000
##
## $conf
## [1] 62.7 65.3
##
## $out
## [1] 193 194 190 188 195 191 200 205 204 195 192 194 202 195 200 193 192 195 191
## [20] 188 191 190 190 194 195 192 190 195 191 192 195 192 193 190 198 201 200 188
## [39] 192 190 194 201 191 191 188 203 189 193 190 204 198 201 201 191 191 195 190
## [58] 188 190 195 195 205 198 190 191 191 195 194 194 202 191 199 203 188 224 188
## [77] 189 191 190 195 193 204 194 195 191 188 195 188 193 199 188 199 194 201 195
## [96] 218
# Replace Income outliers (outside P5-P95) with the column mean
data_tratada <- tratamiento_outliers(data_imp_knn, "Income")
boxplot.stats(data_imp_knn$Income) # before treatment
## $stats
## [1] 8 39 64 98 185
##
## $n
## [1] 5000
##
## $conf
## [1] 62.7 65.3
##
## $out
## [1] 193 194 190 188 195 191 200 205 204 195 192 194 202 195 200 193 192 195 191
## [20] 188 191 190 190 194 195 192 190 195 191 192 195 192 193 190 198 201 200 188
## [39] 192 190 194 201 191 191 188 203 189 193 190 204 198 201 201 191 191 195 190
## [58] 188 190 195 195 205 198 190 191 191 195 194 194 202 191 199 203 188 224 188
## [77] 189 191 190 195 193 204 194 195 191 188 195 188 193 199 188 199 194 201 195
## [96] 218
boxplot.stats(data_tratada$Income) # after treatment
## $stats
## [1] 18 42 70 89 159
##
## $n
## [1] 5000
##
## $conf
## [1] 68.9 71.1
##
## $out
## [1] 161 163 168 169 165 170 170 165 161 160 164 161 170 165 165 164 164 165 160
## [20] 164 162 161 161 169 168 165 165 169 164 162 160 160 170 161 161 163 170 169
## [39] 160 161 160 168 162 160 162 168 170 170 160 169 163 161 164 168 164 161 163
## [58] 161 160 163 163 168 160 162 161 163 164 169 168 164 163 170 162 170 169 165
## [77] 162 164 160 164 170 161 162 168 163 161 161 165 170 164 164 161 162 170 160
## [96] 165 165 162
# CCAvg outlier statistics before treatment
boxplot.stats(data_tratada$CCAvg)
## $stats
## [1] 0.0 0.7 1.5 2.5 5.2
##
## $n
## [1] 5000
##
## $conf
## [1] 1.46 1.54
##
## $out
## [1] 8.90 8.10 5.70 8.00 5.70 5.60 7.20 7.40 7.50 6.50 6.50 7.80
## [13] 7.90 6.80 7.40 7.50 7.90 6.20 5.50 6.90 7.50 7.30 6.10 6.33
## [25] 6.60 5.30 7.50 6.80 7.00 6.60 6.30 7.50 5.70 8.30 5.50 6.90
## [37] 6.10 6.00 8.00 6.80 6.33 7.80 7.20 6.50 6.80 6.00 7.20 8.60
## [49] 6.90 6.10 7.80 6.00 7.60 7.40 6.10 7.00 8.10 6.90 6.40 6.00
## [61] 7.60 6.30 7.60 10.00 6.00 5.90 8.10 5.40 8.80 5.40 6.33 8.10
## [73] 8.80 5.70 7.60 7.30 7.00 5.70 6.10 6.90 6.80 5.60 7.00 6.50
## [85] 7.40 7.80 8.00 7.00 8.00 6.80 6.30 6.30 8.80 8.10 6.33 5.40
## [97] 6.90 9.00 6.00 8.60 5.90 5.40 7.40 6.33 6.80 5.40 7.30 7.40
## [109] 6.70 6.90 7.00 6.00 7.00 6.70 7.40 6.30 6.30 6.00 6.00 7.60
## [121] 6.67 6.33 6.50 6.50 7.40 7.20 5.30 5.70 8.60 8.30 5.80 7.80
## [133] 6.00 5.40 7.40 8.10 6.67 6.00 6.33 6.00 6.50 5.67 7.30 8.50
## [145] 8.00 6.70 7.30 8.00 6.90 10.00 6.50 6.50 5.67 7.30 8.00 5.40
## [157] 5.70 6.90 7.00 6.70 6.80 5.90 10.00 7.50 5.60 6.67 6.10 7.50
## [169] 8.00 6.10 6.70 8.80 7.40 7.30 6.70 6.50 6.10 6.00 6.90 6.30
## [181] 7.80 6.67 6.00 6.10 7.40 7.90 7.50 5.40 5.70 5.40 7.20 5.40
## [193] 8.00 6.10 5.70 5.40 7.20 8.80 7.00 6.50 7.90 6.30 6.90 7.60
## [205] 6.00 5.80 8.20 5.70 7.30 6.67 8.10 7.50 7.80 5.90 6.70 7.00
## [217] 6.50 8.60 8.80 7.00 6.10 5.50 5.70 5.40 6.90 8.00 7.50 6.10
## [229] 5.40 8.80 7.80 6.00 5.33 7.20 8.60 6.70 8.00 7.40 6.90 6.40
## [241] 6.00 5.40 7.00 6.50 5.60 5.60 5.80 5.40 6.00 6.30 8.10 6.90
## [253] 6.30 8.10 5.70 6.00 5.40 6.67 6.80 8.80 7.80 6.50 9.00 7.20
## [265] 6.67 6.00 9.30 5.60 7.50 7.60 5.50 6.50 6.30 6.70 7.20 6.50
## [277] 5.90 6.80 8.60 8.80 7.60 7.60 7.20 5.70 6.33 7.30 6.33 7.20
## [289] 7.40 6.00 6.10 6.40 6.00 8.10 5.60 6.30 7.30 8.50 5.30 6.00
## [301] 5.40 6.50 6.30 8.60 6.67 6.20 6.60 8.00 6.50 7.00 6.33 7.20
## [313] 6.00 6.60 6.00 7.20 7.00 6.00 5.40 6.10 7.50 8.60 5.30 6.67
# Replace CCAvg outliers (outside P5-P95) with the column mean
data_tratada <- tratamiento_outliers(data_tratada, "CCAvg")
# CCAvg outlier statistics after treatment
boxplot.stats(data_tratada$CCAvg)
## $stats
## [1] 0.10 0.70 1.60 2.30 4.67
##
## $n
## [1] 5000
##
## $conf
## [1] 1.56 1.64
##
## $out
## [1] 4.70 5.00 5.70 5.70 5.00 5.60 5.20 5.00 5.10 4.90 5.50 5.30 4.90 5.70 5.50
## [16] 6.00 4.90 4.90 6.00 5.10 6.00 4.70 6.00 6.00 5.90 5.40 5.00 5.40 5.70 4.70
## [31] 5.20 5.20 4.70 4.90 5.70 5.60 5.20 5.00 5.10 4.90 5.40 6.00 4.90 5.20 5.90
## [46] 5.40 5.40 4.90 6.00 6.00 5.20 6.00 4.70 4.90 5.00 5.30 5.70 4.70 5.10 5.80
## [61] 6.00 4.90 5.40 4.80 6.00 4.80 4.90 6.00 4.70 5.67 4.80 4.90 5.67 5.20 4.80
## [76] 4.75 5.40 5.70 5.00 5.00 4.70 5.90 4.90 5.60 4.80 5.20 4.90 6.00 5.00 6.00
## [91] 5.00 5.40 5.70 4.90 5.40 4.90 5.40 5.70 5.40 4.75 6.00 5.00 5.80 5.70 5.20
## [106] 5.90 5.20 5.20 5.00 4.70 4.90 4.70 5.50 5.70 5.00 4.70 5.40 5.40 4.70 5.20
## [121] 5.10 5.00 6.00 5.33 4.90 4.90 4.80 4.70 4.70 6.00 5.40 5.60 5.60 4.70 4.70
## [136] 5.80 5.40 6.00 5.70 5.10 5.20 4.70 6.00 5.40 5.00 5.20 6.00 5.60 5.00 5.20
## [151] 5.50 5.20 5.90 4.90 5.70 6.00 6.00 5.60 4.70 5.30 6.00 5.40 5.00 4.70 5.00
## [166] 6.00 4.90 4.70 6.00 4.70 4.70 4.80 4.70 6.00 5.40 4.70 4.90 5.30
# Mortgage outlier statistics before treatment
boxplot.stats(data_tratada$Mortgage)
## $stats
## [1] 0 0 0 101 252
##
## $n
## [1] 5000
##
## $conf
## [1] -2.26 2.26
##
## $out
## [1] 260 285 412 455 336 309 366 276 315 282 280 264 325 391 617 402 360 392
## [19] 419 270 466 290 458 547 470 304 271 378 314 485 300 272 275 327 322 282
## [37] 364 449 355 314 587 307 263 310 265 305 372 301 289 305 303 256 259 524
## [55] 287 333 357 361 301 366 294 329 442 394 327 475 297 437 428 333 366 257
## [73] 337 382 397 380 297 357 433 483 305 294 287 277 268 354 256 285 318 342
## [91] 266 455 341 421 359 565 319 394 267 601 567 352 284 256 334 268 389 342
## [109] 372 275 589 277 397 359 323 380 329 535 293 398 343 307 272 255 294 311
## [127] 446 262 266 323 319 422 315 289 310 299 428 505 309 400 301 267 422 307
## [145] 257 326 341 298 297 569 374 310 408 352 406 452 432 312 477 396 582 358
## [163] 380 467 331 303 565 295 282 264 327 262 635 352 385 437 328 522 301 276
## [181] 496 415 392 461 344 263 297 368 257 325 256 321 255 296 373 325 329 268
## [199] 292 383 280 358 354 275 408 442 315 427 271 364 429 431 286 508 272 416
## [217] 553 368 403 260 500 313 410 285 273 304 341 449 333 259 277 381 270 402
## [235] 292 400 330 345 294 428 253 255 258 351 427 312 294 353 322 308 278 342
## [253] 464 509 481 281 308 306 577 319 272 330 272 422 301 302 256 328 405 270
## [271] 264 571 307 293 581 550 328 283 400 307 323 263 612 590 313 260 541 342
## [289] 299 308 306
# Replace Mortgage outliers (outside P5-P95) with the column mean
data_tratada <- tratamiento_outliers(data_tratada, "Mortgage")
# Mortgage outlier statistics after treatment
boxplot.stats(data_tratada$Mortgage)
## $stats
## [1] 0 0 0 79 197
##
## $n
## [1] 5000
##
## $conf
## [1] -1.77 1.77
##
## $out
## [1] 260 198 211 207 240 236 198 251 244 209 249 264 251 248 270 220 224 221
## [19] 211 224 271 203 230 272 240 208 211 218 205 227 239 207 207 220 205 214
## [37] 263 252 265 232 212 250 256 203 259 204 249 236 231 205 247 214 229 229
## [55] 233 215 227 228 241 204 236 236 234 257 219 224 215 200 222 230 212 201
## [73] 268 237 221 256 198 238 244 226 227 266 245 249 233 230 221 229 267 231
## [91] 221 199 256 240 268 218 241 198 246 245 218 231 203 242 218 236 207 222
## [109] 219 208 272 229 232 247 255 219 207 223 262 266 242 217 239 251 217 203
## [127] 219 267 218 208 230 257 212 229 234 207 245 216 214 204 248 212 226 204
## [145] 249 220 247 240 205 213 200 201 230 204 240 219 212 264 252 238 235 262
## [163] 244 228 263 221 206 221 257 204 209 256 218 255 240 228 227 199 239 223
## [181] 268 199 209 213 238 221 244 271 202 251 231 211 229 210 272 209 224 225
## [199] 242 260 205 232 229 219 204 221 205 259 230 270 215 251 216 253 255 200
## [217] 258 246 209 220 199 216 241 248 223 202 227 251 217 243 205 272 207 245
## [235] 220 198 272 205 256 241 270 249 264 226 218 200 221 247 207 236 232 263
## [253] 249 225 230 217 217 218 260 243 249 250 213 219
library(dlookr)
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
library(flextable)
# Outlier counts before treatment (compare with the treated data below)
diagnose_outlier(data_imp_knn)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 Age 0 0.00 NaN 45.34
## 2 Experience 0 0.00 NaN 20.17
## 3 Income 96 1.92 194.67 73.77
## 4 Family 0 0.00 NaN 2.40
## 5 CCAvg 324 6.48 6.85 1.94
## 6 Education 0 0.00 NaN 1.88
## 7 Mortgage 291 5.82 355.66 56.50
## 8 Personal.Loan 480 9.60 2.00 1.10
## without_mean
## 1 45.34
## 2 20.17
## 3 71.41
## 4 2.40
## 5 1.60
## 6 1.88
## 7 38.01
## 8 1.00
# Outlier counts after treatment
diagnose_outlier(data_tratada)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 Age 0 0.00 NaN 45.34
## 2 Experience 0 0.00 NaN 20.17
## 3 Income 98 1.96 164.26 71.18
## 4 Family 0 0.00 NaN 2.40
## 5 CCAvg 178 3.56 5.29 1.73
## 6 Education 0 0.00 NaN 1.88
## 7 Mortgage 264 5.28 230.46 40.91
## 8 Personal.Loan 480 9.60 2.00 1.10
## without_mean
## 1 45.34
## 2 20.17
## 3 69.32
## 4 2.40
## 5 1.59
## 6 1.88
## 7 30.34
## 8 1.00
# Side-by-side comparison of Mortgage before/after outlier treatment
par(mfrow = c(1,2))
boxplot(data_imp_knn$Mortgage, main = "sin tratar",
col = 3) # box colour
boxplot(data_tratada$Mortgage, main = "tratada",col=2)
library(caret)
## Loading required package: ggplot2
# Check for (near-)zero-variance predictors
nearZeroVar(data_tratada, saveMetrics = TRUE)
## freqRatio percentUnique zeroVar nzv
## Age 1.01 0.90 FALSE FALSE
## Experience 1.04 1.92 FALSE FALSE
## Income 1.11 2.50 FALSE FALSE
## ZIP.Code 1.33 9.34 FALSE FALSE
## Family 1.14 0.08 FALSE FALSE
## CCAvg 1.00 1.58 FALSE FALSE
## Education 1.40 0.06 FALSE FALSE
## Mortgage 14.02 3.94 FALSE FALSE
## Personal.Loan 9.42 0.04 FALSE FALSE
## Securities.Account 8.58 0.04 FALSE FALSE
## CD.Account 15.56 0.04 FALSE FALSE
## Online 1.48 0.04 FALSE FALSE
## CreditCard 2.40 0.04 FALSE FALSE
# Variables with pairwise correlation above 0.8
library(caret)
# NOTE(review): prefer the full literal FALSE over `F` (reassignable alias).
altaCorr <- findCorrelation(correlation_matrix,
cutoff = 0.80, # flag correlations above 0.8
names = TRUE,
verbose = TRUE,
exact = F)
##
## Combination row 1 and column 2 is above the cut-off, value = 0.994
## Flagging column 1
altaCorr
## [1] "Age"
# Age is highly correlated with another variable (Experience, r = 0.994);
# it could be dropped later as redundant
# --- Min-max normalization for quantitative columns ----
#
# Rescale a numeric vector to [0, 1].
#
# Args:
#   x     : numeric vector
#   na.rm : drop NAs when computing the range (default FALSE, matching the
#           original behavior: any NA makes the whole result NA)
# Returns x rescaled so min -> 0 and max -> 1.
# NOTE: a constant vector yields NaN (zero range), as in the original.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Numeric columns to min-max normalize
numeric_cols <- c("Mortgage","CCAvg","Family","Income","Experience","Age")
data_Normalizada <- data_tratada # copy of the treated data
# Normalize the numeric columns
data_Normalizada[numeric_cols] <- lapply(data_tratada[numeric_cols], normalize)
head(data_Normalizada)
## Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1 0.0455 0.0233 0.2039 91107 1.000 0.2542 1 0.00
## 2 0.5000 0.4419 0.1053 90089 0.667 0.2373 1 0.00
## 3 0.3636 0.3488 0.3669 94720 0.000 0.1525 1 0.00
## 4 0.2727 0.2093 0.5395 94112 0.000 0.4407 2 0.00
## 5 0.2727 0.1860 0.1776 91330 1.000 0.1525 2 0.00
## 6 0.3182 0.3023 0.0724 92121 1.000 0.0508 2 0.57
## Personal.Loan Securities.Account CD.Account Online CreditCard
## 1 1 1 0 0 0
## 2 1 1 0 0 0
## 3 1 0 0 0 0
## 4 1 0 0 0 0
## 5 1 0 0 0 1
## 6 1 0 0 1 0
#head(data_tratada)
data_seleccionada <-data_Normalizada # copy of the normalized data
head(data_seleccionada)
## Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1 0.0455 0.0233 0.2039 91107 1.000 0.2542 1 0.00
## 2 0.5000 0.4419 0.1053 90089 0.667 0.2373 1 0.00
## 3 0.3636 0.3488 0.3669 94720 0.000 0.1525 1 0.00
## 4 0.2727 0.2093 0.5395 94112 0.000 0.4407 2 0.00
## 5 0.2727 0.1860 0.1776 91330 1.000 0.1525 2 0.00
## 6 0.3182 0.3023 0.0724 92121 1.000 0.0508 2 0.57
## Personal.Loan Securities.Account CD.Account Online CreditCard
## 1 1 1 0 0 0
## 2 1 1 0 0 0
## 3 1 0 0 0 0
## 4 1 0 0 0 0
## 5 1 0 0 0 1
## 6 1 0 0 1 0
# --- Feature selection with Boruta ----
library(Boruta)
#install.packages("Boruta")
set.seed(123)
# Bug fix: the original formula was `data_seleccionada$Personal.Loan ~ .`,
# which leaves Personal.Loan itself inside the `.` predictors (it even shows
# up as a "confirmed important" attribute in the results). Naming the column
# on the left-hand side excludes the response from the predictors.
boruta_output <- Boruta(Personal.Loan ~ ., data = data_seleccionada, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## After 11 iterations, +6.4 secs:
## confirmed 9 attributes: Age, CCAvg, CD.Account, CreditCard, Education and 4 more;
## still have 4 attributes left.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## After 15 iterations, +8.7 secs:
## confirmed 1 attribute: Mortgage;
## still have 3 attributes left.
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## After 25 iterations, +14 secs:
## rejected 1 attribute: ZIP.Code;
## still have 2 attributes left.
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## After 30 iterations, +17 secs:
## confirmed 1 attribute: Online;
## still have 1 attribute left.
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## After 49 iterations, +28 secs:
## confirmed 1 attribute: Securities.Account;
## no more attributes left.
# Boruta results
print(boruta_output)
## Boruta performed 49 iterations in 28.3 secs.
## 12 attributes confirmed important: Age, CCAvg, CD.Account, CreditCard,
## Education and 7 more;
## 1 attributes confirmed unimportant: ZIP.Code;
# Important variables (including tentative ones)
final_vars <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(final_vars)
## [1] "Age" "Experience" "Income"
## [4] "Family" "CCAvg" "Education"
## [7] "Mortgage" "Personal.Loan" "Securities.Account"
## [10] "CD.Account" "Online" "CreditCard"
# ZIP.Code is not relevant, so it is dropped
data_seleccionada$ZIP.Code <- NULL # not important
head(data_seleccionada)
## Age Experience Income Family CCAvg Education Mortgage Personal.Loan
## 1 0.0455 0.0233 0.2039 1.000 0.2542 1 0.00 1
## 2 0.5000 0.4419 0.1053 0.667 0.2373 1 0.00 1
## 3 0.3636 0.3488 0.3669 0.000 0.1525 1 0.00 1
## 4 0.2727 0.2093 0.5395 0.000 0.4407 2 0.00 1
## 5 0.2727 0.1860 0.1776 1.000 0.1525 2 0.00 1
## 6 0.3182 0.3023 0.0724 1.000 0.0508 2 0.57 1
## Securities.Account CD.Account Online CreditCard
## 1 1 0 0 0
## 2 1 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 1
## 6 0 0 1 0
data_balanceada <-data_seleccionada # copy of the selected data
str(data_balanceada)
## 'data.frame': 5000 obs. of 12 variables:
## $ Age : num 0.0455 0.5 0.3636 0.2727 0.2727 ...
## $ Experience : num 0.0233 0.4419 0.3488 0.2093 0.186 ...
## $ Income : num 0.204 0.105 0.367 0.539 0.178 ...
## $ Family : num 1 0.667 0 0 1 ...
## $ CCAvg : num 0.254 0.237 0.153 0.441 0.153 ...
## $ Education : num 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 ...
## $ Personal.Loan : num 1 1 1 1 1 1 1 1 1 2 ...
## $ Securities.Account: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
dim(data_balanceada)
## [1] 5000 12
# NOTE(review): the target levels are "1"/"2" (inherited from the earlier
# as.numeric() conversion of the factor), not the original 0/1 coding.
data_balanceada$Personal.Loan <- as.factor(data_balanceada$Personal.Loan)
# Target classes / dummy coding
contrasts(data_balanceada$Personal.Loan)
## 2
## 1 0
## 2 1
table(data_balanceada$Personal.Loan) # class distribution of the target
##
## 1 2
## 4520 480
# Proportion of the dependent variable
round(prop.table(table(data_balanceada$Personal.Loan)), 3) * 100
##
## 1 2
## 90.4 9.6
# The data is clearly imbalanced
# Train (80%) / test (20%) split, stratified on the target
library(caret)
set.seed(123)
index <- createDataPartition(data_balanceada$Personal.Loan,
p = 0.8, list = FALSE)
data_train <- data_balanceada[ index, ] # only the training data gets balanced
data_testing <- data_balanceada[-index, ]
# Class proportions — original data
addmargins(table(data_seleccionada$Personal.Loan))
##
## 1 2 Sum
## 4520 480 5000
round(prop.table(table(data_seleccionada$Personal.Loan)) * 100, 2)
##
## 1 2
## 90.4 9.6
# Class proportions — training set
addmargins(table(data_train$Personal.Loan))
##
## 1 2 Sum
## 3616 384 4000
round(prop.table(table(data_train$Personal.Loan)) * 100, 2)
##
## 1 2
## 90.4 9.6
# Class proportions — test set
addmargins(table(data_testing$Personal.Loan))
##
## 1 2 Sum
## 904 96 1000
round(prop.table(table(data_testing$Personal.Loan)) * 100, 2)
##
## 1 2
## 90.4 9.6
# --- Class balancing with SMOTE (training set only) ----
library(performanceEstimation)
set.seed(123)
smote_train <- smote(Personal.Loan ~ .,
data = data_train,
perc.over = 2, # SMOTE oversampling of the minority class
perc.under = 1.5) # undersampling of the majority class
addmargins(table(data_train$Personal.Loan))
##
## 1 2 Sum
## 3616 384 4000
addmargins(table(smote_train$Personal.Loan))
##
## 1 2 Sum
## 1152 1152 2304
library(caret)
# 10-fold cross-validation, shared by all caret models below
ctrl <- trainControl(method = "cv", number = 10)
# --- GLM (logistic regression) ----
contrasts(smote_train$Personal.Loan)
## 2
## 1 0
## 2 1
# level "1" = class 0 (No), level "2" = class 1 (Yes)
set.seed(123)
modelo_glm <- train(Personal.Loan ~ .,
data = smote_train,
method = "glm", family = "binomial",
trControl = ctrl,
metric = "Accuracy")
modelo_glm
## Generalized Linear Model
##
## 2304 samples
## 11 predictor
## 2 classes: '1', '2'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2073, 2074, 2073, 2074, 2074, 2073, ...
## Resampling results:
##
## Accuracy Kappa
## 0.868 0.737
# Predicted probabilities and classes on the test set
proba.modelo_glm <- predict(modelo_glm,
newdata = data_testing,
type = "prob")
predict.clase_glm <- predict(modelo_glm,
newdata = data_testing)
# Confusion matrix (positive class = "2", i.e. loan accepted)
result_glm <- caret::confusionMatrix(predict.clase_glm,
data_testing$Personal.Loan,
positive = "2")
result_glm # performance indicators
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 789 19
## 2 115 77
##
## Accuracy : 0.866
## 95% CI : (0.843, 0.887)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.466
##
## Mcnemar's Test P-Value : 0.000000000000000227
##
## Sensitivity : 0.802
## Specificity : 0.873
## Pos Pred Value : 0.401
## Neg Pred Value : 0.976
## Prevalence : 0.096
## Detection Rate : 0.077
## Detection Prevalence : 0.192
## Balanced Accuracy : 0.837
##
## 'Positive' Class : 2
##
# --- KNN classifier ----
set.seed(123)
modelo_knn <- train(Personal.Loan ~ .,
data = smote_train,
method = "knn",
trControl = ctrl,
tuneGrid = expand.grid(k = seq(1, 81, 2)), # tune over odd k = 1,3,...,81 (41 candidates)
metric="Accuracy")
predict.clase_KNN <- predict(modelo_knn, newdata = data_testing)
result_knn <- caret::confusionMatrix(predict.clase_KNN,
data_testing$Personal.Loan,
positive = "2")
result_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 836 19
## 2 68 77
##
## Accuracy : 0.913
## 95% CI : (0.894, 0.93)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 0.181
##
## Kappa : 0.592
##
## Mcnemar's Test P-Value : 0.000000266
##
## Sensitivity : 0.802
## Specificity : 0.925
## Pos Pred Value : 0.531
## Neg Pred Value : 0.978
## Prevalence : 0.096
## Detection Rate : 0.077
## Detection Prevalence : 0.145
## Balanced Accuracy : 0.863
##
## 'Positive' Class : 2
##
# --- SVM (radial basis kernel) ----
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:dlookr':
##
## kurtosis, skewness
set.seed(123)
modelo_svm <- train(Personal.Loan ~ .,
data = smote_train,
method = "svmRadial",
trControl = ctrl,
metric = "Accuracy")
# Predicted classes on the test set and confusion matrix (positive = "2")
predict.clase_svm <- predict(modelo_svm, newdata = data_testing)
result_svm <- confusionMatrix(predict.clase_svm, data_testing$Personal.Loan, positive = "2")
result_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 833 15
## 2 71 81
##
## Accuracy : 0.914
## 95% CI : (0.895, 0.931)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 0.154
##
## Kappa : 0.607
##
## Mcnemar's Test P-Value : 0.00000000301
##
## Sensitivity : 0.844
## Specificity : 0.921
## Pos Pred Value : 0.533
## Neg Pred Value : 0.982
## Prevalence : 0.096
## Detection Rate : 0.081
## Detection Prevalence : 0.152
## Balanced Accuracy : 0.883
##
## 'Positive' Class : 2
##
# --- MLP (single-hidden-layer perceptron) ----
set.seed(123)
library("nnet")
# Bug fix: `trControl` is a caret::train() argument; nnet() does not accept it
# (it was silently absorbed by `...` and had no effect), so it is removed.
# nnet() itself performs no cross-validation.
modelo_mlp <- nnet(Personal.Loan ~ .,
                   data = smote_train,
                   size = 10,   # 10 hidden units
                   maxit = 100) # maximum training iterations
## # weights: 131
## initial value 1660.926523
## iter 10 value 679.041188
## iter 20 value 458.245140
## iter 30 value 405.158899
## iter 40 value 336.827053
## iter 50 value 294.042918
## iter 60 value 274.621363
## iter 70 value 261.776680
## iter 80 value 243.395071
## iter 90 value 230.475425
## iter 100 value 219.414472
## final value 219.414472
## stopped after 100 iterations
# Predictions with the MLP model ("raw" = predicted probability of level "2")
proba.modelo_mlp <- predict(modelo_mlp, newdata = data_testing, type = "raw")
contrasts(smote_train$Personal.Loan)
## 2
## 1 0
## 2 1
# Convert probabilities into class labels (0.5 threshold)
predict.clase_mlp <- as.factor(ifelse(proba.modelo_mlp > 0.5, "2", "1"))
# Confusion matrix
result_mlp <- confusionMatrix(predict.clase_mlp, data_testing$Personal.Loan, positive = "2")
result_mlp
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 813 18
## 2 91 78
##
## Accuracy : 0.891
## 95% CI : (0.87, 0.91)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 0.924
##
## Kappa : 0.531
##
## Mcnemar's Test P-Value : 0.00000000000534
##
## Sensitivity : 0.812
## Specificity : 0.899
## Pos Pred Value : 0.462
## Neg Pred Value : 0.978
## Prevalence : 0.096
## Detection Rate : 0.078
## Detection Prevalence : 0.169
## Balanced Accuracy : 0.856
##
## 'Positive' Class : 2
##
#C50
set.seed(123)
# Boosted C5.0 decision tree, tuned via the shared caret control object.
modelo_c50 <- train(
  Personal.Loan ~ .,
  data      = smote_train,
  method    = "C5.0",
  trControl = ctrl,
  metric    = "Accuracy"
)
# Inspect which predictors the tree actually relies on
varImp(modelo_c50)
## C5.0 variable importance
##
## Overall
## Income 100.0
## CCAvg 100.0
## CD.Account1 100.0
## CreditCard1 91.2
## Experience 90.2
## Securities.Account1 79.5
## Online1 79.5
## Education 79.3
## Family 75.1
## Age 0.0
## Mortgage 0.0
# Class probabilities and hard labels for the test set
proba.modelo_c50 <- predict(modelo_c50, newdata = data_testing, type = "prob")
predict.clase_c50 <- predict(modelo_c50, newdata = data_testing)
# Confusion matrix with class "2" as the positive class
result_c50 <- confusionMatrix(predict.clase_c50, data_testing$Personal.Loan, positive = "2")
result_c50
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 880 3
## 2 24 93
##
## Accuracy : 0.973
## 95% CI : (0.961, 0.982)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : < 0.0000000000000002
##
## Kappa : 0.858
##
## Mcnemar's Test P-Value : 0.000119
##
## Sensitivity : 0.969
## Specificity : 0.973
## Pos Pred Value : 0.795
## Neg Pred Value : 0.997
## Prevalence : 0.096
## Detection Rate : 0.093
## Detection Prevalence : 0.117
## Balanced Accuracy : 0.971
##
## 'Positive' Class : 2
##
#ENSAMBLE
library(adabag)
## Loading required package: rpart
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)
#Bagging (randomForest)
# Random forest: bagging of 100 trees, trained with the shared CV control.
set.seed(123)
modelo.rf <- train(
  Personal.Loan ~ .,
  data      = smote_train,  # SMOTE-balanced training set
  method    = "rf",         # random forest
  ntree     = 100,          # number of trees in the forest
  trControl = ctrl          # cross-validation control
  #metric = "ROC"           # alternative evaluation metric (disabled)
)
predict.clase_rf <- predict(modelo.rf, data_testing)
#MODELOS Boosting
#Adaboost
# AdaBoost via adabag::boosting() (adabag is already attached above, so the
# duplicate library(adabag) call is dropped).
# NOTE(review): the original call also passed `B = 50`, `verbose = TRUE` and
# `metric = "ROC"` — none of these are boosting() arguments (they fall into
# `...` and are ignored), so the effective number of boosting iterations was
# always `mfinal = 3`, contradicting the old "50 iterations" comment. The
# no-op arguments are removed here; the fitted model is unchanged.
set.seed(123)
modelo.adaboost <- boosting(
  Personal.Loan ~ .,
  data = smote_train,  # SMOTE-balanced training set
  boos = TRUE,         # draw weighted bootstrap samples at each iteration
  mfinal = 3           # number of boosting iterations
)
# predict() on a boosting object returns a list; $class holds the labels
predict.clase_adaboost <- as.factor(predict(modelo.adaboost, data_testing)$class)
#GBM
# Gradient Boosting Machine trained with the shared CV control;
# gbm's own progress messages are suppressed.
modelo.gbm <- train(
  Personal.Loan ~ .,
  data      = smote_train,  # SMOTE-balanced training set
  method    = "gbm",        # gradient boosting machine
  trControl = ctrl,         # cross-validation control
  verbose   = FALSE         # silence per-iteration output
  #metric = "ROC"           # alternative evaluation metric (disabled)
)
predict.clase_gbm <- predict(modelo.gbm, data_testing)
# How similar are the eight models' predictions? Highly correlated models
# add little diversity to a majority-vote ensemble.
pred_df <- data.frame(
  predict.clase_glm,
  predict.clase_KNN,
  predict.clase_svm,
  predict.clase_mlp,
  predict.clase_c50,
  predict.clase_rf,
  predict.clase_adaboost,
  predict.clase_gbm
)
# Factors are coerced to their numeric codes before correlating
cor_matrix <- cor(sapply(pred_df, as.numeric))
cat("Matriz de Correlación de las Predicciones:\n")
## Matriz de Correlación de las Predicciones:
print(cor_matrix)
## predict.clase_glm predict.clase_KNN predict.clase_svm
## predict.clase_glm 1.000 0.470 0.656
## predict.clase_KNN 0.470 1.000 0.633
## predict.clase_svm 0.656 0.633 1.000
## predict.clase_mlp 0.559 0.587 0.746
## predict.clase_c50 0.541 0.601 0.678
## predict.clase_rf 0.548 0.589 0.735
## predict.clase_adaboost 0.538 0.548 0.730
## predict.clase_gbm 0.593 0.598 0.768
## predict.clase_mlp predict.clase_c50 predict.clase_rf
## predict.clase_glm 0.559 0.541 0.548
## predict.clase_KNN 0.587 0.601 0.589
## predict.clase_svm 0.746 0.678 0.735
## predict.clase_mlp 1.000 0.641 0.655
## predict.clase_c50 0.641 1.000 0.891
## predict.clase_rf 0.655 0.891 1.000
## predict.clase_adaboost 0.613 0.784 0.844
## predict.clase_gbm 0.655 0.864 0.877
## predict.clase_adaboost predict.clase_gbm
## predict.clase_glm 0.538 0.593
## predict.clase_KNN 0.548 0.598
## predict.clase_svm 0.730 0.768
## predict.clase_mlp 0.613 0.655
## predict.clase_c50 0.784 0.864
## predict.clase_rf 0.844 0.877
## predict.clase_adaboost 1.000 0.869
## predict.clase_gbm 0.869 1.000
# GBM and RF correlate strongly with the other models, so they are dropped
# from the stacking ensemble; majority vote over the remaining six models.
ensamble <- data.frame(predict.clase_glm, predict.clase_KNN, predict.clase_svm,
                       predict.clase_mlp, predict.clase_c50, predict.clase_adaboost)
ensamble$Cant.yes <- rowSums(ensamble[, 1:6] == 2)  # votes for "accepts loan"
ensamble$Cant.no <- rowSums(ensamble[, 1:6] == 1)   # votes for "declines"
# Majority vote; note that a 3-3 tie resolves to 1 ("no") because of the
# strict `>` comparison.
ensamble$predict <- ifelse(ensamble$Cant.yes > ensamble$Cant.no, 2, 1)
predict.clase_ensamble <- as.factor(ensamble$predict)
#str(predict.clase_ensamble)
#names(ensamble)
#head(ensamble)
# NOTE(review): the original stored this table in a variable literally named
# `confusionMatrix`, shadowing caret::confusionMatrix(). R's function-call
# lookup still found the function, but the masking is a footgun — renamed.
conf_ensamble <- table(Predicted = ensamble$predict, Actual = data_testing$Personal.Loan)
conf_ensamble
## Actual
## Predicted 1 2
## 1 865 14
## 2 39 82
accuracy <- sum(diag(conf_ensamble)) / sum(conf_ensamble)
accuracy
## [1] 0.947
# NOTE(review): caret's sensitivity()/specificity() on a table take the FIRST
# row/column level ("1") as the positive class, unlike the per-model reports
# above which used positive = "2" — confirm this is intended.
sensitivity_ensamble <- sensitivity(conf_ensamble)
specificity_ensamble <- specificity(conf_ensamble)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# 0/1 encoding ("2" = event) for ROC/AUC of the hard ensemble vote
binary_actual <- ifelse(data_testing$Personal.Loan == "2", 1, 0)
ensamble$predict1 <- ifelse(ensamble$predict == "2", 1, 0)
roc_ensamble <- roc(binary_actual, ensamble$predict1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_ensamble <- auc(roc_ensamble)
auc_ensamble
## Area under the curve: 0.906
library(irr) # kappa2() for the ensemble's Kappa
## Loading required package: lpSolve
# Side-by-side metrics for the individual models plus the stacking ensemble.
# NOTE(review): the original rebuilt confusionMatrix() FOUR times per model
# (once per metric column) inside the data.frame() call; here each model's
# confusion matrix and ROC are computed once and reused. The resulting table
# is identical. As in the original, no `positive` argument is passed, so
# caret takes the first factor level ("1") as the positive class — unlike
# the per-model reports above, which used positive = "2"; confirm intended.
model_preds <- list(
  GLM      = predict.clase_glm,
  KNN      = predict.clase_KNN,
  SVM      = predict.clase_svm,
  MLP      = predict.clase_mlp,
  C50      = predict.clase_c50,
  RF       = predict.clase_rf,
  AdaBoost = predict.clase_adaboost
)
cms <- lapply(model_preds, function(p) confusionMatrix(p, data_testing$Personal.Loan))
aucs <- vapply(model_preds,
               function(p) as.numeric(auc(roc(binary_actual, ifelse(p == 2, 1, 0)))),
               numeric(1))
results <- data.frame(
  Model = c(names(model_preds), "Stacking"),
  Accuracy = unname(c(
    vapply(cms, function(m) m$overall[["Accuracy"]], numeric(1)),
    accuracy
  )),
  Sensitivity = unname(c(
    vapply(cms, function(m) m$byClass[["Sensitivity"]], numeric(1)),
    sensitivity_ensamble
  )),
  Specificity = unname(c(
    vapply(cms, function(m) m$byClass[["Specificity"]], numeric(1)),
    specificity_ensamble
  )),
  Kappa = unname(c(
    vapply(cms, function(m) m$overall[["Kappa"]], numeric(1)),
    kappa2(data.frame(data_testing$Personal.Loan, ensamble$predict))$value
  )),
  AUC = unname(c(aucs, auc_ensamble))
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Print the comparison table
print(results)
## Model Accuracy Sensitivity Specificity Kappa AUC
## 1 GLM 0.866 0.873 0.802 0.466 0.837
## 2 KNN 0.913 0.925 0.802 0.592 0.863
## 3 SVM 0.914 0.921 0.844 0.607 0.883
## 4 MLP 0.891 0.899 0.812 0.531 0.856
## 5 C50 0.973 0.973 0.969 0.858 0.971
## 6 RF 0.961 0.959 0.979 0.807 0.969
## 7 AdaBoost 0.936 0.933 0.969 0.710 0.951
## 8 Stacking 0.947 0.957 0.854 0.726 0.906
#dim(data_balanceada)
#dim(smote_train)
Se concluye que el mejor modelo para predecir si un cliente aceptará el préstamo personal ofrecido en la
última campaña es el C5.0, porque presenta los mejores indicadores en la mayoría de las métricas evaluadas
(exactitud, sensibilidad, especificidad, Kappa y AUC).