-
Notifications
You must be signed in to change notification settings - Fork 153
Iris multi class classification using RandomForest
NOTE: RandomForest is supported in Hivemall v0.4 or later.
Attribute Information:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
-- Set up the Iris database and the raw input table.
CREATE DATABASE iris;
USE iris;

-- Raw Iris dataset: four measurements (in cm) plus the class name,
-- loaded from CSV files placed under /dataset/iris/raw.
-- NOTE(review): the measurements are declared int but the Iris data
-- contains decimal values (e.g. 5.1) — confirm whether these should
-- be double; changing them would also affect the `training` table.
CREATE EXTERNAL TABLE raw (
  sepal_length int,
  sepal_width int,
  petal_length int,
  -- NOTE(review): "petak" looks like a typo for "petal"; kept as-is
  -- because the `training` query references this column name.
  petak_width int,
  class string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE LOCATION '/dataset/iris/raw';
$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
-- Map each Iris class name to a zero-based integer label (0..2),
-- assigned in alphabetical order of the class name.
-- Fix: the original applied DISTINCT *after* the window function, so
-- dense_rank() was evaluated over every row of `raw` and the duplicates
-- were removed afterwards. Deduplicating first runs the window over
-- just the distinct classes — same result, clearer and cheaper.
CREATE TABLE label_mapping
AS
SELECT
  class,
  rank - 1 AS label
FROM (
  SELECT
    class,
    dense_rank() OVER (ORDER BY class) AS rank
  FROM (
    -- one row per distinct class
    SELECT DISTINCT class FROM raw
  ) u
) t
;
-- Training set: one row per example, with a unique rowid, a dense
-- feature vector (double[] as far as the trainer is concerned), and
-- the zero-based integer class label.
CREATE TABLE training
AS
SELECT
  rowid() AS rowid,
  -- NOTE(review): petak_width matches the (likely misspelled) column in `raw`.
  array(r.sepal_length, r.sepal_width, r.petal_length, r.petak_width) AS features,
  m.label
FROM
  raw r
  JOIN label_mapping m ON (r.class = m.class)
;
train_randomforest_classifier takes dense features as double[] and an integer label starting from 0.
-- Train the RandomForest classifier and persist the resulting trees.
-- Stored as SEQUENCEFILE; see the note below about TEXTFILE and the
-- Javascript output option.
CREATE TABLE model
STORED AS SEQUENCEFILE
AS
SELECT
  -- Hivemall v0.4.1-alpha.2 and before:
  --   train_randomforest_classifier(features, label)
  --     as (pred_model, var_importance, oob_errors, oob_tests)
  -- Hivemall v0.4.1 and later:
  --   train_randomforest_classifier(features, label)
  --     as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
  train_randomforest_classifier(features, label)
FROM
  training;
Note: The default TEXTFILE format should not be used for the model table when emitting Javascript output through the "-output javascript" option.
hive> desc model;
model_id int
model_type int
pred_model string
var_importance array<double>
oob_errors int
oob_tests int
"-help" option shows usage of the function.
-- Show the function's usage (raises UDFArgumentException by design, printing the help text below).
select train_randomforest_classifier(features, label, "-help") from training;
> FAILED: UDFArgumentException
usage: train_randomforest_classifier(double[] features, int label [,
string options]) - Returns a relation consists of <int model_id,
int model_type, string pred_model, array<double> var_importance,
int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>]
[-disable_compression] [-help] [-leafs <arg>] [-output <arg>]
[-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars
<arg>]
-attrs,--attribute_types <arg> Comma separated attribute types (Q for
quantitative variable and C for
categorical variable. e.g., [Q,C,Q,C])
-depth,--max_depth <arg> The maximum number of the tree depth
[default: Integer.MAX_VALUE]
-disable_compression Whether to disable compression of the
output script [default: false]
-help Show function help
-leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes
[default: Integer.MAX_VALUE]
-output,--output_type <arg> The output type (serialization/ser or
opscode/vm or javascript/js) [default:
serialization]
-rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY]
-seed <arg> seed value in long [default: -1
(random)]
-splits,--min_split <arg> A node that has greater than or equals
to `min_split` examples will split
[default: 2]
-trees,--num_trees <arg> The number of trees for each task
[default: 50]
-vars,--num_variables <arg> The number of random selected features
[default: ceil(sqrt(x[0].length))].
int(num_variables * x[0].length) is
considered if num_variable is (0,1]
Caution: "-trees" controls the number of trees for each task, not the total number of trees.
To parallelize RandomForest training, you can use UNION ALL as follows:
-- Parallelized training: each SELECT branch is an independent task
-- training 25 trees, so the UNION ALL yields 50 trees total across
-- two tasks.
CREATE TABLE model
STORED AS SEQUENCEFILE
AS
SELECT train_randomforest_classifier(features, label, '-trees 25')
FROM training
UNION ALL
SELECT train_randomforest_classifier(features, label, '-trees 25')
FROM training
;
Variable importance
and Out Of Bag (OOB) error rate
of RandomForest can be shown as follows:
-- Aggregate the per-task variable importances (element-wise array sum)
-- and compute the overall Out-Of-Bag error rate across all tasks.
SELECT
  array_sum(var_importance) AS var_importance,
  sum(oob_errors) / sum(oob_tests) AS oob_err_rate
FROM
  model;
[2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334
-- Export the trained trees as human-readable Javascript
-- (compression disabled so the script is stored as plain text).
CREATE TABLE model_javascript
STORED AS SEQUENCEFILE
AS
SELECT train_randomforest_classifier(features, label, "-output_type js -disable_compression")
FROM training;

-- Inspect one exported tree.
SELECT model FROM model_javascript LIMIT 1;
if(x[3] <= 0.5) {
0;
} else {
if(x[2] <= 4.5) {
if(x[3] <= 1.5) {
if(x[0] <= 4.5) {
1;
} else {
if(x[0] <= 5.5) {
1;
} else {
if(x[1] <= 2.5) {
1;
} else {
1;
}
}
}
} else {
2;
}
} else {
if(x[3] <= 1.5) {
2;
} else {
2;
}
}
}
-- Run every tree over every training row, then combine the per-tree
-- predictions per rowid with rf_ensemble.
-- Classification mode flag, passed to tree_predict via ${classification}.
set hivevar:classification=true;
-- NOTE(review): presumably enables a map-side join so the small model
-- table is broadcast — confirm against the Hivemall docs.
set hive.auto.convert.join=true;
-- NOTE(review): the tutorial disables the optimized map-join hashtable;
-- presumably required for this join pattern — confirm.
set hive.mapjoin.optimized.hashtable=false;
create table predicted_vm
as
SELECT
rowid,
-- Combine the per-tree predictions for each row into one prediction.
rf_ensemble(predicted) as predicted
FROM (
SELECT
rowid,
-- hivemall v0.4.1-alpha.2 and before
-- tree_predict(p.model, t.features, ${classification}) as predicted
-- hivemall v0.4.1 and later
tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
FROM
model p
LEFT OUTER JOIN -- CROSS JOIN
-- No ON clause: every model row is paired with every training row,
-- i.e. an effective cross join (one prediction per tree per example).
training t
) t1
group by
rowid
;
Note: Javascript outputs can be evaluated by js_tree_predict.
The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N.
-- N-parallel variant of the prediction query: the model rows are
-- scattered across reducers before the join so tree_predict runs in
-- N parallel tasks.
-- Classification mode flag, passed to tree_predict via ${classification}.
SET hivevar:classification=true;
set hive.auto.convert.join=true;
-- NOTE(review): same hashtable setting as the non-parallel query;
-- presumably required for this join pattern — confirm.
SET hive.mapjoin.optimized.hashtable=false;
-- N: number of reducers, i.e. the degree of prediction parallelism.
SET mapred.reduce.tasks=8;
create table predicted_vm
as
SELECT
rowid,
-- Combine the per-tree predictions for each row into one prediction.
rf_ensemble(predicted) as predicted
FROM (
SELECT
t.rowid,
-- hivemall v0.4.1-alpha.2 and before
-- tree_predict(p.pred_model, t.features, ${classification}) as predicted
-- hivemall v0.4.1 and later
tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
FROM (
SELECT model_id, model_type, pred_model
FROM model
-- Spread the model rows (pseudo-)randomly over the reducers.
DISTRIBUTE BY rand(1)
) p
-- No ON clause: effective cross join of every model row with every
-- training row, as in the non-parallel query.
LEFT OUTER JOIN training t
) t1
group by
rowid
;
-- Count the training examples; the result (150) is recorded manually
-- in the hivevar below and used as the accuracy denominator.
select count(1) from training;
> 150
set hivevar:total_cnt=150;
-- Classification accuracy: the fraction of examples whose predicted
-- label matches the actual label.
-- Fix: the original divided by the manually-maintained ${total_cnt}
-- constant, which silently skews the result if it drifts from the true
-- row count; compute the denominator in-query instead.
WITH t1 AS (
  -- one row per prediction, paired with its ground-truth label
  SELECT
    t.rowid,
    t.label AS actual,
    p.predicted.label AS predicted
  FROM
    predicted_vm p
    LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
)
SELECT
  -- matches / total examples
  sum(if(actual = predicted, 1, 0)) / count(1) AS accuracy
FROM
  t1
;
0.9533333333333334