# 使用Go 机器学习库来进行数据分析 2 (决策树)

## 决策树和随机森林

golearn支持两种决策树算法：ID3和RandomTree。

• ID3: 以信息增益为准则选择信息增益最大的属性。

ID3 is a decision tree induction algorithm which splits on the Attribute which gives the greatest Information Gain (entropy gradient). It performs well on categorical data. Numeric datasets will need to be discretised before using ID3.

• RandomTree: 与ID3类似，但是在选择分裂属性的时候随机选择。

Random Trees are structurally identical to those generated by ID3, but the split Attribute is chosen randomly. Golearn's implementation allows you to choose up to k nodes for consideration at each split.

## 代码

// Demonstrates decision tree classification

package main

import (
"fmt"
"github.com/sjwhitworth/golearn/base"
"github.com/sjwhitworth/golearn/ensemble"
"github.com/sjwhitworth/golearn/evaluation"
"github.com/sjwhitworth/golearn/filters"
"github.com/sjwhitworth/golearn/trees"
"math/rand"
)

// fitAndEvaluate trains the given classifier on trainData, predicts
// testData, and prints a confusion-matrix summary under the given heading.
// It panics on any error, matching the example's fail-fast style.
func fitAndEvaluate(heading string, tree base.Classifier, trainData, testData base.FixedDataGrid) {
	if err := tree.Fit(trainData); err != nil {
		panic(err)
	}

	predictions, err := tree.Predict(testData)
	if err != nil {
		panic(err)
	}

	fmt.Println(heading)
	cf, err := evaluation.GetConfusionMatrix(testData, predictions)
	if err != nil {
		panic(fmt.Sprintf("Unable to get confusion matrix: %s", err.Error()))
	}
	fmt.Println(evaluation.GetSummary(cf))
}

func main() {
	// Seed the RNG so the random splits and RandomTree results below
	// are reproducible.
	rand.Seed(44111342)

	// Load in the iris dataset.
	// BUGFIX: this line was missing in the original listing, leaving
	// both `iris` and `err` undeclared — the code did not compile.
	iris, err := base.ParseCSVToInstances("datasets/iris_headers.csv", true)
	if err != nil {
		panic(err)
	}

	// Discretise the iris dataset with Chi-Merge. ID3 performs well on
	// categorical data, so the numeric iris features must be binned first.
	filt := filters.NewChiMergeFilter(iris, 0.999)
	for _, a := range base.NonClassFloatAttributes(iris) {
		// BUGFIX: the original loop body was empty, so the filter had
		// no attributes registered and discretised nothing.
		filt.AddAttribute(a)
	}
	filt.Train()
	irisf := base.NewLazilyFilteredInstances(iris, filt)

	// Create a 60-40 training-test split.
	trainData, testData := base.InstancesTrainTestSplit(irisf, 0.60)

	// First up, ID3 with three different split rules.
	// (The 0.6 parameter controls the train-prune split.)
	fitAndEvaluate("ID3 Performance (information gain)",
		trees.NewID3DecisionTree(0.6), trainData, testData)
	fitAndEvaluate("ID3 Performance (information gain ratio)",
		trees.NewID3DecisionTreeFromRule(0.6, new(trees.InformationGainRatioRuleGenerator)), trainData, testData)
	fitAndEvaluate("ID3 Performance (gini index generator)",
		trees.NewID3DecisionTreeFromRule(0.6, new(trees.GiniCoefficientRuleGenerator)), trainData, testData)

	// Next up, Random Trees: structurally identical to ID3, but the
	// split attribute is chosen at random. Consider two randomly-chosen
	// attributes at each split.
	fitAndEvaluate("RandomTree Performance",
		trees.NewRandomTree(2), trainData, testData)

	// Finally, a Random Forest of 70 trees, each considering
	// 3 attributes per split.
	fitAndEvaluate("RandomForest Performance",
		ensemble.NewRandomForest(70, 3), trainData, testData)
}


## 评估结果

ID3 Performance (information gain)
Reference Class True Positives False Positives True Negatives Precision Recall F1 Score
---------------------------------------------------------------------------------
Iris-virginica 32 5 46 0.8649 0.9697 0.9143
Iris-versicolor 4 1 61 0.8000 0.1818 0.2963
Iris-setosa 29 13 42 0.6905 1.0000 0.8169
Overall accuracy: 0.7738

ID3 Performance (information gain ratio)
Reference Class True Positives False Positives True Negatives Precision Recall F1 Score
---------------------------------------------------------------------------------
Iris-virginica 29 3 48 0.9062 0.8788 0.8923
Iris-versicolor 5 3 59 0.6250 0.2273 0.3333
Iris-setosa 29 15 40 0.6591 1.0000 0.7945
Overall accuracy: 0.7500

ID3 Performance (gini index generator)
Reference Class True Positives False Positives True Negatives Precision Recall F1 Score
---------------------------------------------------------------------------------
Iris-virginica 26 5 46 0.8387 0.7879 0.8125
Iris-versicolor 17 36 26 0.3208 0.7727 0.4533
Iris-setosa 0 0 55 NaN 0.0000 NaN
Overall accuracy: 0.5119

RandomTree Performance
Reference Class True Positives False Positives True Negatives Precision Recall F1 Score
---------------------------------------------------------------------------------
Iris-virginica 30 3 48 0.9091 0.9091 0.9091
Iris-versicolor 9 3 59 0.7500 0.4091 0.5294
Iris-setosa 29 10 45 0.7436 1.0000 0.8529
Overall accuracy: 0.8095

RandomForest Performance
Reference Class True Positives False Positives True Negatives Precision Recall F1 Score
---------------------------------------------------------------------------------
Iris-virginica 31 8 43 0.7949 0.9394 0.8611
Iris-versicolor 0 0 62 NaN 0.0000 NaN
Iris-setosa 29 16 39 0.6444 1.0000 0.7838
Overall accuracy: 0.7143