Skip to content
This repository has been archived by the owner on Nov 19, 2020. It is now read-only.

Commit

Permalink
GH-211: Any samples on how to use Boosted Decision Trees?
Browse files Browse the repository at this point in the history
  • Loading branch information
cesarsouza committed Sep 8, 2017
1 parent 14c4f18 commit 9bfc5e9
Show file tree
Hide file tree
Showing 6 changed files with 378 additions and 98 deletions.
15 changes: 10 additions & 5 deletions Sources/Accord.MachineLearning/DecisionTrees/DecisionTreeHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ namespace Accord.MachineLearning.DecisionTrees

internal static class DecisionTreeHelper
{
public static void CheckArgs(DecisionTree tree, int[][] inputs, int[] outputs)
public static void CheckArgs(DecisionTree tree, int[][] inputs, int[] outputs, double[] weights = null)
{
checkArgs(tree, inputs, outputs);
checkArgs(tree, inputs, outputs, weights);

for (int i = 0; i < inputs.Length; i++)
{
Expand All @@ -60,9 +60,9 @@ public static void CheckArgs(DecisionTree tree, int[][] inputs, int[] outputs)
}
}

public static void CheckArgs(DecisionTree tree, double[][] inputs, int[] outputs)
public static void CheckArgs(DecisionTree tree, double[][] inputs, int[] outputs, double[] weights = null)
{
checkArgs(tree, inputs, outputs);
checkArgs(tree, inputs, outputs, weights);

for (int i = 0; i < inputs.Length; i++)
{
Expand All @@ -84,7 +84,7 @@ public static void CheckArgs(DecisionTree tree, double[][] inputs, int[] outputs
}
}

private static void checkArgs(DecisionTree tree, Array[] inputs, int[] outputs)
private static void checkArgs(DecisionTree tree, Array[] inputs, int[] outputs, double[] weights = null)
{
if (inputs == null)
throw new ArgumentNullException("inputs");
Expand All @@ -100,6 +100,11 @@ private static void checkArgs(DecisionTree tree, Array[] inputs, int[] outputs)
throw new ArgumentOutOfRangeException("inputs",
"Training algorithm needs at least one training vector.");

if (weights != null)
if (inputs.Length != weights.Length)
throw new ArgumentOutOfRangeException("weights",
"The number of input vectors and weights does not match.");

for (int i = 0; i < inputs.Length; i++)
{
if (inputs[i] == null)
Expand Down
104 changes: 60 additions & 44 deletions Sources/Accord.MachineLearning/DecisionTrees/Learning/C45Learning.cs
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,10 @@ private void init(DecisionTree tree)
/// <summary>
///   Learns a model that can map the given inputs to the given outputs.
/// </summary>
///
/// <param name="x">The model inputs.</param>
/// <param name="y">The desired outputs associated with each input vector.</param>
/// <param name="weights">The weight of importance for each input-output pair.
///   Pass null for uniform weights.</param>
///
/// <returns>A <see cref="DecisionTree"/> that has learned to map <paramref name="x"/> to <paramref name="y"/>.</returns>
///
public DecisionTree Learn(double[][] x, int[] y, double[] weights = null)
{
    // Lazily build the tree structure from the training data when the
    // user has not supplied an already-initialized model.
    if (Model == null)
        init(DecisionTreeHelper.Create(x, y, this.Attributes));

    // Sample weights are forwarded to the learning procedure; a null
    // weights vector is replaced by uniform weights inside run().
    this.run(x, y, weights);
    return Model;
}

Expand All @@ -227,13 +224,10 @@ public DecisionTree Learn(double[][] x, int[] y, double[] weights = null)
/// <summary>
///   Learns a model that can map the given inputs to the given outputs.
/// </summary>
///
/// <param name="x">The model inputs. Null entries denote missing values.</param>
/// <param name="y">The desired outputs associated with each input vector.</param>
/// <param name="weights">The weight of importance for each input-output pair.
///   Pass null for uniform weights.</param>
///
/// <returns>A <see cref="DecisionTree"/> that has learned to map <paramref name="x"/> to <paramref name="y"/>.</returns>
///
public DecisionTree Learn(int?[][] x, int[] y, double[] weights = null)
{
    // Lazily build the tree structure from the training data when the
    // user has not supplied an already-initialized model.
    if (Model == null)
        init(DecisionTreeHelper.Create(x, y, this.Attributes));

    // Missing (null) values are encoded as NaN before learning so the
    // double-based implementation can recognize and route around them.
    this.run(x.Apply((xi, i, j) => xi.HasValue ? (double)xi : Double.NaN), y, weights);
    return Model;
}

Expand All @@ -255,7 +249,7 @@ public DecisionTree Learn(int[][] x, int[] y, double[] weights = null)
if (Model == null)
init(DecisionTreeHelper.Create(x, y, this.Attributes));

this.run(x.ToDouble(), y);
this.run(x.ToDouble(), y, weights);
return Model;
}

Expand All @@ -272,17 +266,20 @@ public DecisionTree Learn(int[][] x, int[] y, double[] weights = null)
/// <summary>
///   Obsolete entry point kept for backwards compatibility: trains the tree
///   on the given data and returns the mean zero-one classification error
///   measured on that same training set.
/// </summary>
///
/// <param name="inputs">The training input vectors.</param>
/// <param name="outputs">The class label for each input vector.</param>
///
/// <returns>The fraction of training samples misclassified by the learned tree.</returns>
///
[Obsolete("Please use Learn(x, y) instead.")]
public double Run(double[][] inputs, int[] outputs)
{
    // The legacy API has no sample weights: pass null so run() falls
    // back to uniform weights.
    run(inputs, outputs, null);

    return new ZeroOneLoss(outputs)
    {
        Mean = true
    }.Loss(Model.Decide(inputs));
}

private void run(double[][] inputs, int[] outputs)
private void run(double[][] inputs, int[] outputs, double[] weights)
{
if (weights == null)
weights = Vector.Ones(inputs.Length);

// Initial argument check
DecisionTreeHelper.CheckArgs(Model, inputs, outputs);
DecisionTreeHelper.CheckArgs(Model, inputs, outputs, weights);

// Reset the usage of all attributes
for (int i = 0; i < AttributeUsageCount.Length; i++)
Expand Down Expand Up @@ -335,19 +332,19 @@ private void run(double[][] inputs, int[] outputs)
Model.Root = new DecisionNode(Model);

// Recursively split the tree nodes
split(Model.Root, inputs, outputs, height: 0);
split(Model.Root, inputs, outputs, weights, height: 0);
}

private void split(DecisionNode root, double[][] input, int[] output, int height)
private void split(DecisionNode root, double[][] inputs, int[] outputs, double[] weights, int height)
{
// 2. If all examples are for the same class, return the single-node
// tree with the output label corresponding to this common class.
double entropy = Measures.Entropy(output, Model.NumberOfClasses);
double entropy = Measures.WeightedEntropy(outputs, weights, Model.NumberOfClasses);

if (entropy == 0)
{
if (output.Length > 0)
root.Output = output[0];
if (outputs.Length > 0)
root.Output = outputs[0];
return;
}

Expand All @@ -360,7 +357,7 @@ private void split(DecisionNode root, double[][] input, int[] output, int height

if (candidates.Length == 0 || (MaxHeight > 0 && height == MaxHeight))
{
root.Output = Measures.Mode(output);
root.Output = Measures.WeightedMode(outputs, weights);
return;
}

Expand All @@ -382,7 +379,7 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
// For each attribute in the data set
for (int i = 0; i < scores.Length; i++)
{
scores[i] = computeGainRatio(input, output, candidates[i],
scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
entropy, out partitions[i], out thresholds[i]);
}
}
Expand All @@ -391,7 +388,7 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
// For each attribute in the data set
Parallel.For(0, scores.Length, ParallelOptions, i =>
{
scores[i] = computeGainRatio(input, output, candidates[i],
scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
entropy, out partitions[i], out thresholds[i]);
});
}
Expand All @@ -408,6 +405,7 @@ private void split(DecisionNode root, double[][] input, int[] output, int height

double[][] inputSubset;
int[] outputSubset;
double[] weightSubset;

// Now, create next nodes and pass those partitions as their responsibilities.
if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
Expand All @@ -426,9 +424,10 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
Comparison = ComparisonKind.Equal,
};

inputSubset = input.Get(maxGainPartition[i]);
outputSubset = output.Get(maxGainPartition[i]);
split(children[i], inputSubset, outputSubset, height + 1); // recursion
inputSubset = inputs.Get(maxGainPartition[i]);
outputSubset = outputs.Get(maxGainPartition[i]);
weightSubset = weights.Get(maxGainPartition[i]);
split(children[i], inputSubset, outputSubset, weightSubset, height + 1); // recursion
}

root.Branches.AttributeIndex = maxGainAttribute;
Expand Down Expand Up @@ -462,14 +461,16 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
};

// Create a branch for lower values
inputSubset = input.Get(partitionBelowThreshold);
outputSubset = output.Get(partitionBelowThreshold);
split(children[0], inputSubset, outputSubset, height + 1);
inputSubset = inputs.Get(partitionBelowThreshold);
outputSubset = outputs.Get(partitionBelowThreshold);
weightSubset = weights.Get(partitionBelowThreshold);
split(children[0], inputSubset, outputSubset, weightSubset, height + 1);

// Create a branch for higher values
inputSubset = input.Get(partitionAboveThreshold);
outputSubset = output.Get(partitionAboveThreshold);
split(children[1], inputSubset, outputSubset, height + 1);
inputSubset = inputs.Get(partitionAboveThreshold);
outputSubset = outputs.Get(partitionAboveThreshold);
weightSubset = weights.Get(partitionAboveThreshold);
split(children[1], inputSubset, outputSubset, weightSubset, height + 1);

root.Branches.AttributeIndex = maxGainAttribute;
root.Branches.AddRange(children);
Expand All @@ -485,7 +486,7 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
// majority of the currently selected output classes.

var outputIndices = partitionBelowThreshold ?? partitionAboveThreshold;
outputSubset = output.Get(outputIndices);
outputSubset = outputs.Get(outputIndices);
root.Output = Measures.Mode(outputSubset);
}
}
Expand All @@ -494,28 +495,28 @@ private void split(DecisionNode root, double[][] input, int[] output, int height
}


/// <summary>
///   Computes the (weighted) gain ratio obtained by splitting on the given
///   attribute: the information gain divided by the split information.
/// </summary>
///
/// <param name="input">The training input vectors under this node.</param>
/// <param name="output">The class label for each input vector.</param>
/// <param name="weight">The importance weight for each input vector.</param>
/// <param name="attributeIndex">The candidate attribute to split on.</param>
/// <param name="entropy">The entropy of <paramref name="output"/> before the split.</param>
/// <param name="partitions">The sample index partitions induced by the split.</param>
/// <param name="threshold">The best threshold found (continuous attributes only).</param>
///
private double computeGainRatio(double[][] input, int[] output, double[] weight, int attributeIndex,
    double entropy, out List<int>[] partitions, out double threshold)
{
    List<int> missing;

    double infoGain = computeInfoGain(input, output, weight, attributeIndex,
        entropy, out partitions, out missing, out threshold);

    // Split information normalizes the gain, penalizing attributes
    // that fragment the data into many small partitions.
    double splitInfo = SplitInformation(output.Length, partitions, missing);

    // Guard against division by zero when either term vanishes.
    return infoGain == 0 || splitInfo == 0 ? 0 : infoGain / splitInfo;
}

/// <summary>
///   Computes the (weighted) information gain obtained by splitting on the
///   given attribute, dispatching to the discrete or continuous routine
///   according to the attribute's nature.
/// </summary>
///
/// <param name="input">The training input vectors under this node.</param>
/// <param name="output">The class label for each input vector.</param>
/// <param name="weight">The importance weight for each input vector.</param>
/// <param name="attributeIndex">The candidate attribute to split on.</param>
/// <param name="entropy">The entropy of <paramref name="output"/> before the split.</param>
/// <param name="partitions">The sample index partitions induced by the split.</param>
/// <param name="missing">Indices of samples whose attribute value is missing.</param>
/// <param name="threshold">The best threshold found (continuous attributes only;
///   zero for discrete attributes).</param>
///
private double computeInfoGain(double[][] input, int[] output, double[] weight, int attributeIndex,
    double entropy, out List<int>[] partitions, out List<int> missing, out double threshold)
{
    threshold = 0;

    // Discrete attribute: gain = parent entropy minus the weighted
    // average entropy of the partitions induced by each attribute value.
    if (Model.Attributes[attributeIndex].Nature == DecisionVariableKind.Discrete)
        return entropy - computeInfoDiscrete(input, output, weight, attributeIndex, out partitions, out missing);

    // Continuous attribute: computeInfoContinuous returns the best split's
    // conditional entropy already negated, hence the '+' here.
    return entropy + computeInfoContinuous(input, output, weight, attributeIndex, out partitions, out missing, out threshold);
}

private double computeInfoDiscrete(double[][] input, int[] output,
private double computeInfoDiscrete(double[][] input, int[] output, double[] weight,
int attributeIndex, out List<int>[] partitions, out List<int> missingValues)
{
// Compute the information gain obtained by using
Expand All @@ -542,29 +543,36 @@ private double computeInfoDiscrete(double[][] input, int[] output,
// according to the attribute values
var indicesInPartition = new List<int>();

double weightTotalSum = 0;
double weightSubsetSum = 0;

for (int j = 0; j < input.Length; j++)
{
double x = input[j][attributeIndex];
if (!Double.IsNaN(x) && x == value)
{
indicesInPartition.Add(j);
weightSubsetSum += weight[j];
}
weightTotalSum += weight[j];
}

// For each of the instances under responsibility
// of this node, check which have the same value
int[] outputSubset = output.Get(indicesInPartition);
double[] weightSubset = weight.Get(indicesInPartition);

// Check the entropy gain originating from this partitioning
double e = Measures.Entropy(outputSubset, Model.NumberOfClasses);

info += (outputSubset.Length / (double)output.Length) * e;
double e = Measures.WeightedEntropy(outputSubset, weightSubset, Model.NumberOfClasses);
info += (weightSubsetSum / weightTotalSum) * e;

partitions[i] = indicesInPartition;
}

return info;
}

private double computeInfoContinuous(double[][] input, int[] output,
private double computeInfoContinuous(double[][] input, int[] output, double[] weight,
int attributeIndex, out List<int>[] partitions, out List<int> missingValues, out double threshold)
{
// Compute the information gain obtained by using
Expand Down Expand Up @@ -599,6 +607,9 @@ private double computeInfoContinuous(double[][] input, int[] output,
var output1 = new List<int>(input.Length);
var output2 = new List<int>(input.Length);

var weights1 = new List<double>(input.Length);
var weights2 = new List<double>(input.Length);

// For each possible splitting point of the attribute
for (int i = 0; i < t.Length; i += splitStep)
{
Expand All @@ -618,20 +629,23 @@ private double computeInfoContinuous(double[][] input, int[] output,
{
indicesBelowThreshold.Add(j);
output1.Add(output[j]);
weights1.Add(weight[j]);
}
else if (x > value)
{
indicesAboveThreshold.Add(j);
output2.Add(output[j]);
weights2.Add(weight[j]);
}
}

double p1 = output1.Count / (double)output.Length;
double p2 = output2.Count / (double)output.Length;
double weightSum = weight.Sum();
double p1 = weights1.Sum() / weightSum;
double p2 = weights2.Sum() / weightSum;

double splitGain =
-p1 * Measures.Entropy(output1, Model.NumberOfClasses) +
-p2 * Measures.Entropy(output2, Model.NumberOfClasses);
-p1 * Measures.WeightedEntropy(output1, weights1, Model.NumberOfClasses) +
-p2 * Measures.WeightedEntropy(output2, weights2, Model.NumberOfClasses);

if (splitGain > bestGain)
{
Expand All @@ -655,6 +669,8 @@ private double computeInfoContinuous(double[][] input, int[] output,

output1.Clear();
output2.Clear();
weights1.Clear();
weights2.Clear();
}

threshold = bestThreshold;
Expand Down
Loading

0 comments on commit 9bfc5e9

Please sign in to comment.