본문 바로가기

데이터 다루기/Base of R

[R] Decision Tree (의사결정나무)

728x90
반응형
# Load the German credit data set and list its column names.
german <- read.csv(file = "German_credit.csv")
names(german)

 [1] "ID"                       "Checking_account"         "Duration_in_month"        "Credit_history"          
 [5] "Purpose"                  "Credit_amount"            "Saving_accout"            "Present_employment"      
 [9] "Installment_rate"         "Personal_status___sex"    "Other_debtors_guarantors" "Present_residence"       
[13] "Property"                 "Age"                      "Other_installment_plan"   "Housing"                 
[17] "Num_of_existing_credits"  "Job"                      "Num_of_people_liable"     "Telephone"               
[21] "Foreign_worker"           "Credit_status" 

데이터셋은 German_credit을 사용합니다.

해당 데이터셋은 1,000개의 관측치와 22개의 변수들로 구성됩니다.

# Convert the categorical variables to factors.
# Credit_status (the response) is converted as well: since R 4.0 read.csv()
# keeps strings as character, and the classifiers used later (C5.0, chaid)
# as well as caret::confusionMatrix() expect a factor outcome.
categorical_cols <- c(
  "Checking_account", "Credit_history", "Purpose", "Saving_accout",
  "Present_employment", "Personal_status___sex", "Other_debtors_guarantors",
  "Property", "Other_installment_plan", "Housing", "Job", "Telephone",
  "Foreign_worker", "Credit_status"
)
german[categorical_cols] <- lapply(german[categorical_cols], as.factor)

범주형 변수들을 factor 자료형으로 변경합니다.

# Classification Tree
# Class distribution of the response before sampling.
table(german$Credit_status)

library(sampling)
# Fix the RNG seed so the stratified sample and the train/validation split
# below can be reproduced.
set.seed(1234)
# Stratified simple random sampling without replacement: 300 rows per class.
stratified_sampling <- strata(
  german,
  stratanames = c("Credit_status"),
  size = c(300, 300),
  method = "srswor"
)

st_data <- getdata(german, stratified_sampling)
table(st_data$Credit_status)
rm(stratified_sampling)

library(caret)
# Partition on the response (not on ID) so that the 70/30 split is stratified
# by class and both subsets keep the balanced class ratio.
train <- createDataPartition(st_data$Credit_status, p = 0.7, list = FALSE)
td <- st_data[train, ]
vd <- st_data[-train, ]
rm(st_data, train)

colnames(td)
# Drop the row identifier and the bookkeeping columns appended by
# strata()/getdata(). Selecting by name avoids depending on column positions.
drop_cols <- c("ID", "ID_unit", "Prob", "Stratum")
td <- td[, !(names(td) %in% drop_cols)]
vd <- vd[, !(names(vd) %in% drop_cols)]
rm(drop_cols)

저번 포스팅과 마찬가지로 층화추출과 데이터 분할을 진행합니다.

# DT ("C50", "rpart", "CHAID")
# C50 - Entropy
# Install C50 only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50")
}
library(C50)
# rules = TRUE asks C5.0 to decompose the tree into a rule-based model.
tree1 <- C5.0(Credit_status ~ ., data = td, rules = TRUE)
summary(tree1)
# Evaluate on the validation set (confusionMatrix() comes from caret).
treepred <- predict(tree1, vd, type = "class")
confusionMatrix(treepred, vd$Credit_status)


Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 58 17
         Y 32 73
                                          
               Accuracy : 0.7278          
                 95% CI : (0.6566, 0.7913)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : 4.009e-10       
                                          
                  Kappa : 0.4556          
                                          
 Mcnemar's Test P-Value : 0.0455          
                                          
            Sensitivity : 0.6444          
            Specificity : 0.8111          
         Pos Pred Value : 0.7733          
         Neg Pred Value : 0.6952          
             Prevalence : 0.5000          
         Detection Rate : 0.3222          
   Detection Prevalence : 0.4167          
      Balanced Accuracy : 0.7278          
                                          
       'Positive' Class : N  

Tree 알고리즘은 여러가지가 있습니다.

제가 생각할 때 가장 좋은 알고리즘은 C5.0 입니다.

테스트 정확도는 0.7278 이네요.

# CART - GINI
library(rpart)
# Fit a classification tree on the training data, then inspect and draw it.
tree2 <- rpart(formula = Credit_status ~ ., data = td, method = "class")
summary(tree2)
plot(tree2)
text(tree2, cex = 0.7, all = TRUE, use.n = TRUE)

두 번째는 Gini 지수를 불순도로 하는 CART입니다.

# Predict classes for the validation set with the CART model and compare
# them against the observed labels.
treepred <- predict(object = tree2, newdata = vd, type = "class")
confusionMatrix(data = treepred, reference = vd$Credit_status)


Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 62 18
         Y 28 72
                                          
               Accuracy : 0.7444          
                 95% CI : (0.6742, 0.8064)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : 1.8e-11         
                                          
                  Kappa : 0.4889          
                                          
 Mcnemar's Test P-Value : 0.1845          
                                          
            Sensitivity : 0.6889          
            Specificity : 0.8000          
         Pos Pred Value : 0.7750          
         Neg Pred Value : 0.7200          
             Prevalence : 0.5000          
         Detection Rate : 0.3444          
   Detection Prevalence : 0.4444          
      Balanced Accuracy : 0.7444          
                                          
       'Positive' Class : N               
                                 

테스트 정확도는 0.7444로 C5.0 보다 좋네요.

# CHAID - Chi-square test
# The CHAID package accepts only factor and ordered variable types
# Install from R-Forge (over https) only when the package is missing.
if (!requireNamespace("CHAID", quietly = TRUE)) {
  install.packages("CHAID", repos = "https://R-Forge.R-project.org")
}
library(CHAID)
colnames(td)
# Remove the numeric predictors by name (instead of by position) because
# chaid() cannot use them.
numeric_cols <- c(
  "Duration_in_month", "Credit_amount", "Installment_rate",
  "Present_residence", "Age", "Num_of_existing_credits",
  "Num_of_people_liable"
)
ttd <- td[, !(names(td) %in% numeric_cols)]
ctrl <- chaid_control(minsplit = 20, minbucket = 5)
tree3 <- chaid(Credit_status ~ ., data = ttd, control = ctrl)
print(tree3)
plot(tree3)

다음은 카이제곱 검정(Chi-square test)으로 분할 기준을 정하는 CHAID 패키지의 의사결정나무 함수입니다.

# Predict classes for the validation set with the CHAID model and compare
# them against the observed labels.
treepred <- predict(tree3, newdata = vd)
confusionMatrix(data = treepred, reference = vd$Credit_status)


Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 66 27
         Y 24 63
                                          
               Accuracy : 0.7167          
                 95% CI : (0.6448, 0.7812)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : 2.766e-09       
                                          
                  Kappa : 0.4333          
                                          
 Mcnemar's Test P-Value : 0.7794          
                                          
            Sensitivity : 0.7333          
            Specificity : 0.7000          
         Pos Pred Value : 0.7097          
         Neg Pred Value : 0.7241          
             Prevalence : 0.5000          
         Detection Rate : 0.3667          
   Detection Prevalence : 0.5167          
      Balanced Accuracy : 0.7167          
                                          
       'Positive' Class : N   

테스트 정확도는 0.7167입니다.

# Regression Tree
library(rpart)
# method = "anova" fits a regression tree; the response here is the numeric
# Credit_amount column and all remaining columns of td are predictors.
tree2 <- rpart(formula = Credit_amount ~ ., data = td, method = "anova")
plot(tree2)
text(tree2, cex = 0.7, all = TRUE, use.n = TRUE)

마지막은 Regression Tree입니다.

회귀 문제를 풀 때 사용됩니다.

반응형