We saw in Part 4 how to build a decision tree predictor. We are now going to create a predictor from a very classic machine learning data set, the Iris data set.

## A decision tree classifier on the Iris data set

We add to the LabeledData class two methods: randomSplit and evaluate. The purpose of the evaluate method is to evaluate the predictions of a tree with respect to a given metric on the corresponding data set.

/** The building blocks of a binary decision tree being leaves and nodes,
* we create two Scala classes Leaf and Node that extend a trait DecisionTree.
*/
trait DecisionTree[-A, +B] {
def predict(sample: A): B
}

/** A leaf is a simple element that provides a prediction value (or decision). */
case class Leaf[A, B](decision: B) extends DecisionTree[A, B] {
def predict(sample: A): B = decision
}

/** A node stores a test function as well as to which node or leaf to go next depending on
* the result of the test.
* We define as:
*   - *right* the leaf or node to go to when the test is true,
*   - *left* the leaf or node to go to when the test is false.
*/
case class Node[A, B](test: A => Boolean, left: DecisionTree[A, B], right: DecisionTree[A, B])
extends DecisionTree[A, B] {

def predict(sample: A): B = test(sample) match {
case true => right.predict(sample)
case false => left.predict(sample)
}
}

case class LabelCombiner[B](combine: Vector[B] => B) {

def combine(left: B, right: B): B = combine(Vector(left, right))
}

import scala.util.Random

class LabeledData[A, B] private (
private val referenceSamples: Vector[A],
private val referenceLabels: Vector[B],
private val indices: Vector[Int]) {

def size: Int = indices.length

def isEmpty: Boolean = size == 0

def subset(indices: Vector[Int]): LabeledData[A, B] =
new LabeledData(referenceSamples, referenceLabels, indices)

def emptySet: LabeledData[A, B] = subset(Vector())

def groupBy[C](f: A => C): Map[C, LabeledData[A, B]] =
((indices groupBy {idx => f(referenceSamples(idx))})
mapValues subset)

def partition(f: A => Boolean): (LabeledData[A, B], LabeledData[A, B]) = {
val groups = groupBy(f)
(groups(true), groups(false))
}

def countDistinctLabels: Int =
(indices map referenceLabels).distinct.length

def mapSamples[C](f: A => C): Vector[C] =
indices map {idx => f(referenceSamples(idx))}

def combineLabels(labelCombiner: LabelCombiner[B]): B =
labelCombiner.combine(indices map referenceLabels)

def randomSplit(seed: Long)(ratios: Double*): Vector[LabeledData[A, B]] = {
val total = ratios.sum
val numberItems = ratios.toVector map {r => (r / total * size).toInt}
val rand = new Random(seed)
val zero = (Vector[Int](), rand.shuffle((0 until size).toVector))
numberItems.scanLeft(zero)((s, n) => s._2.splitAt(n)).drop(1).map(_._1).map(subset)
}

def evaluate(tree: DecisionTree[A, B], metric: (Vector[B], Vector[B]) => Double): Double = {
val predictions = mapSamples(tree.predict)
metric(predictions, indices map referenceLabels)
}
}

object LabeledData {

def apply[A, B](samples: Vector[A], labels: Vector[B]):
LabeledData[A, B] = {
require(
samples.length == labels.length,
"Samples and labels should have the same number of elements.")
new LabeledData[A, B](samples, labels, samples.indices.toVector)
}
}
$line25.$read$$iw$$iw$LabeledData$@596e455d

/** We associate to each leaf or node a unique id *number*,
* and also store the depth of the leaf or node.
*
* For each node, the next left and right nodes or leaves ids
* by considering that each level of the tree is full. The id
* numbers represent all the possible branching.
* By starting with a root node id number of 0, we get the
* following numbering for the first levels of the tree:
*                  0
*         1                2
*      3     4          5     6
*     7 8   9 10      11 12 13 14
*/
case class Id(number: Long, depth: Int) {
require(number >= 0, "Id number should be nonnegative.")

def nextIdLeft: Id = Id(number * 2 + 1, depth + 1)

def nextIdRight: Id = Id(number * 2 + 2, depth + 1)
}

case class DecisionTreeBuilder[A, B](dataSet: LabeledData[A, B])
(combiner: LabelCombiner[B]) {

def build(testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {
val rootId: Id = Id(0, 0)
buildStep(dataSet, rootId, testBuilder)
}

private def buildStep(
subSet: LabeledData[A, B],
id: Id,
testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {

testBuilder(subSet, id) match {
case Left(_) => Leaf[A, B](subSet.combineLabels(combiner))
case Right(test) =>
val Seq(leftSubSet, rightSubSet) = nextSubsets(subSet, test)
if (leftSubSet.isEmpty || rightSubSet.isEmpty) {
println("Warning: one of right or left indices is empty.")
Leaf[A, B](subSet.combineLabels(combiner))
}
else Node(
test,
buildStep(leftSubSet, id.nextIdLeft, testBuilder),
buildStep(rightSubSet, id.nextIdRight, testBuilder)
)
}
}

private def nextSubsets(subSet: LabeledData[A, B], test: A => Boolean):
Seq[LabeledData[A, B]] = {
val groups = subSet.groupBy(test) withDefaultValue subSet.emptySet
Seq(groups(false), groups(true))
}
}
defined class Id
defined class DecisionTreeBuilder


We now build a decision tree from the Iris data set. We split the data set into two random sets, the training set and test set being of respective sizes of 70% and 30%.

// The data set can be downloaded from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml
// (University of California, School of Information and Computer Science)
val url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
val data = scala.io.Source.fromURL(url).getLines.withFilter(_.nonEmpty).toVector
val splitData = (data map { _.split(',').toVector.splitAt(4) }).unzip
val (samples, labels) = (splitData._1 map { v => v map (_.toDouble) }, splitData._2 map {_ (0) })
val dataSet = LabeledData[Vector[Double], String](samples, labels)

val Vector(trainSet, testSet) = dataSet.randomSplit(seed=0)(70, 30)

val combiner: LabelCombiner[String] =
LabelCombiner(v => {
val groups = v groupBy identity
groups.maxBy(_._2.length)._1})

val treeBuilder = DecisionTreeBuilder(trainSet)(combiner)

def testBuilder(subSet: LabeledData[Vector[Double], String], id: Id):
Either[String, Vector[Double] => Boolean] = {
if (id.depth > 3) Left("Maximal depth reached.")
else if (subSet.size < 10) Left("Number of elements per leaf reached.")
else if (subSet.countDistinctLabels == 1) Left("All elements have the same label.")
else {
/* The Iris data set has 4 features. The data is splitted feature after feature
by taking the values below and above (3 * min + max) / 4
*/
val i: Int = (id.number % 4).toInt
val v = subSet.mapSamples(_(i))
val m = v.foldLeft((v(0), v(0)))((acc, si) => (acc._1 min si, acc._2 max si))
if (m._1 == m._2) Left("Cannot split node.")
else Right((x: Vector[Double]) => x(i) > (3 * m._1 + m._2) / 4)
}
}

val tree = treeBuilder.build(testBuilder)
def accuracy[B](predictions: IndexedSeq[B], labels: IndexedSeq[B]): Double =
((predictions zip labels) map (x => if (x._1 == x._2) 1.0 else 0.0)).sum / labels.length
s"Test set accuracy : \${testSet.evaluate(tree, accuracy)}"
Test set accuracy : 0.9777777777777777


Using our simple decision tree builder, we are able to get an accuracy of 97.77% on a random test set of the Iris data set.