728x90
반응형
# Load the German credit data set from the working directory and list its
# column names.
german <- read.csv(file = "German_credit.csv")
names(german)
[1] "ID" "Checking_account" "Duration_in_month" "Credit_history"
[5] "Purpose" "Credit_amount" "Saving_accout" "Present_employment"
[9] "Installment_rate" "Personal_status___sex" "Other_debtors_guarantors" "Present_residence"
[13] "Property" "Age" "Other_installment_plan" "Housing"
[17] "Num_of_existing_credits" "Job" "Num_of_people_liable" "Telephone"
[21] "Foreign_worker" "Credit_status"
데이터셋은 German_credit을 사용합니다.
해당 데이터셋은 1,000개의 관측치와 22개의 변수들로 구성됩니다.
# Categorical variable setup ----
# Convert every categorical column to a factor in a single vectorised step.
# Credit_status (the class label) is included on purpose: since R 4.0
# read.csv() leaves text columns as character, while C5.0(), chaid() and
# caret::confusionMatrix() all expect a factor outcome.
categorical_cols <- c(
  "Checking_account", "Credit_history", "Purpose", "Saving_accout",
  "Present_employment", "Personal_status___sex", "Other_debtors_guarantors",
  "Property", "Other_installment_plan", "Housing", "Job", "Telephone",
  "Foreign_worker", "Credit_status"
)
german[categorical_cols] <- lapply(german[categorical_cols], as.factor)
범주형 변수들을 factor 자료형으로 변경합니다.
# Classification Tree ----
# Class distribution of the full data set, before balancing.
table(german$Credit_status)

library(sampling)
library(caret)

# Fix the RNG seed so the stratified sample and the train/validation split
# can be reproduced from run to run.
set.seed(1234)

# Balanced stratified sample: 300 rows per Credit_status level, drawn by
# simple random sampling without replacement ("srswor").
stratified_sampling <- strata(
  german,
  stratanames = c("Credit_status"),
  size = c(300, 300),
  method = "srswor"
)
st_data <- getdata(german, stratified_sampling)
table(st_data$Credit_status)
rm(stratified_sampling)

# 70/30 train/validation split. Partition on the outcome so both parts keep
# the class balance; partitioning on ID would only stratify on ID quantiles.
train <- createDataPartition(st_data$Credit_status, p = 0.7, list = FALSE)
td <- st_data[train, ]
vd <- st_data[-train, ]
rm(st_data, train)
colnames(td)

# Drop the row identifier and the bookkeeping columns that strata()/getdata()
# append (ID_unit, Prob, Stratum). Selecting by name instead of by position
# keeps this correct if the column layout of the CSV changes.
drop_cols <- c("ID", "ID_unit", "Prob", "Stratum")
td <- td[, !(names(td) %in% drop_cols)]
vd <- vd[, !(names(vd) %in% drop_cols)]
저번 포스팅과 마찬가지로 층화추출과 데이터 분할을 진행합니다.
# Decision trees ("C50", "rpart", "CHAID")
# C5.0 - entropy
# Install C50 only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50")
}
library(C50)

# Fit C5.0 on the training data; rules = TRUE requests a rule-based model
# rather than a plain tree.
tree1 <- C5.0(Credit_status ~ ., data = td, rules = TRUE)
summary(tree1)

# Predict classes for the validation set and compare them with the actual
# labels.
treepred <- predict(tree1, newdata = vd, type = "class")
confusionMatrix(data = treepred, reference = vd$Credit_status)
Confusion Matrix and Statistics
Reference
Prediction N Y
N 58 17
Y 32 73
Accuracy : 0.7278
95% CI : (0.6566, 0.7913)
No Information Rate : 0.5
P-Value [Acc > NIR] : 4.009e-10
Kappa : 0.4556
Mcnemar's Test P-Value : 0.0455
Sensitivity : 0.6444
Specificity : 0.8111
Pos Pred Value : 0.7733
Neg Pred Value : 0.6952
Prevalence : 0.5000
Detection Rate : 0.3222
Detection Prevalence : 0.4167
Balanced Accuracy : 0.7278
'Positive' Class : N
Tree 알고리즘은 여러가지가 있습니다.
제가 생각할 때 가장 좋은 알고리즘은 C5.0 입니다.
테스트 정확도는 0.7278 이네요.
# CART - Gini
library(rpart)

# Fit a classification tree (method = "class") on the training data and
# print the detailed split summary.
tree2 <- rpart(Credit_status ~ ., data = td, method = "class")
summary(tree2)

# Draw the tree skeleton, then add labels: all = TRUE labels every node (not
# only the leaves) and use.n = TRUE adds the per-class observation counts.
plot(tree2)
text(tree2, all = TRUE, use.n = TRUE, cex = 0.7)
두 번째는 Gini 지수를 불순도로 하는 CART입니다.
# Validation-set class predictions from the CART model, followed by the
# confusion matrix against the actual labels.
treepred <- predict(object = tree2, newdata = vd, type = "class")
confusionMatrix(data = treepred, reference = vd$Credit_status)
Confusion Matrix and Statistics
Reference
Prediction N Y
N 62 18
Y 28 72
Accuracy : 0.7444
95% CI : (0.6742, 0.8064)
No Information Rate : 0.5
P-Value [Acc > NIR] : 1.8e-11
Kappa : 0.4889
Mcnemar's Test P-Value : 0.1845
Sensitivity : 0.6889
Specificity : 0.8000
Pos Pred Value : 0.7750
Neg Pred Value : 0.7200
Prevalence : 0.5000
Detection Rate : 0.3444
Detection Prevalence : 0.4444
Balanced Accuracy : 0.7444
'Positive' Class : N
테스트 정확도는 0.7444로 C5.0 보다 좋네요.
# CHAID - Chi-square test
# The CHAID package accepts only factor and ordered variable types
# CHAID is installed from R-Forge; install only when it is missing instead of
# reinstalling on every run.
if (!requireNamespace("CHAID", quietly = TRUE)) {
  install.packages("CHAID", repos = "http://R-Forge.R-project.org")
}
library(CHAID)
colnames(td)

# Keep only the factor (including ordered) columns, selected by type rather
# than by hard-coded positions, so the subset stays correct if the column
# order changes. Credit_status must already be a factor to be retained.
ttd <- td[, vapply(td, is.factor, logical(1)), drop = FALSE]

# Control settings passed to chaid(): minsplit = 20, minbucket = 5.
ctrl <- chaid_control(minsplit = 20, minbucket = 5)
tree3 <- chaid(Credit_status ~ ., data = ttd, control = ctrl)
print(tree3)
plot(tree3)
다음은 카이제곱 검정(Chi-square test)으로 분할 기준을 정하는 CHAID 패키지의 의사결정나무 함수 chaid()입니다. CHAID는 factor(또는 ordered) 자료형만 입력으로 받기 때문에, 수치형 변수는 제외하고 학습합니다.
# Validation-set predictions from the CHAID tree, followed by the confusion
# matrix against the actual labels.
treepred <- predict(object = tree3, newdata = vd)
confusionMatrix(data = treepred, reference = vd$Credit_status)
Confusion Matrix and Statistics
Reference
Prediction N Y
N 66 27
Y 24 63
Accuracy : 0.7167
95% CI : (0.6448, 0.7812)
No Information Rate : 0.5
P-Value [Acc > NIR] : 2.766e-09
Kappa : 0.4333
Mcnemar's Test P-Value : 0.7794
Sensitivity : 0.7333
Specificity : 0.7000
Pos Pred Value : 0.7097
Neg Pred Value : 0.7241
Prevalence : 0.5000
Detection Rate : 0.3667
Detection Prevalence : 0.5167
Balanced Accuracy : 0.7167
'Positive' Class : N
Test 정확도는 0.7167입니다.
# Regression Tree
library(rpart)

# method = "anova" fits a regression tree with Credit_amount as the response
# (presumably numeric - confirm against the CSV); all remaining columns of td,
# including Credit_status, act as predictors.
# NOTE(review): this reuses the name tree2 and therefore replaces the CART
# classifier fitted earlier.
tree2 <- rpart(Credit_amount ~ ., data = td, method = "anova")

# Draw the tree, then label every node (all = TRUE) with its observation
# count (use.n = TRUE).
plot(tree2)
text(tree2, all = TRUE, use.n = TRUE, cex = 0.7)
마지막은 Regression Tree입니다.
회귀 문제를 풀 때 사용됩니다.
반응형
'데이터 다루기 > Base of R' 카테고리의 다른 글
[R] 협업 필터링 (Collaborative filtering) (0) | 2020.05.07 |
---|---|
[R] 연관성 분석 (Association rule) (0) | 2020.04.19 |
[R] Logistic Regression (0) | 2020.04.07 |
[R] K-nearest neighbor (KNN) method (0) | 2020.04.07 |
[R] Hierarchical clustering, K-means clustering (0) | 2020.03.16 |