Skip to content
This repository has been archived by the owner on Oct 8, 2019. It is now read-only.

news20 multiclass dataset (preparation for one-vs-the-rest classifiers)

Makoto YUI edited this page May 17, 2016 · 6 revisions

One-vs-the-rest is a multiclass classification method that trains one binary classifier per class, independently of the others; each classifier separates its own class from all the remaining classes. See http://en.wikipedia.org/wiki/Multiclass_classification#one_vs_all

UDF preparation

-- Reload the Hivemall jar in this session: drop any copy that is already registered, then add it again.
-- The paths below are the ones used in this tutorial.
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;

-- Run the define-all.hive script.
-- NOTE(review): presumably this registers the Hivemall functions used below (addBias, amplify) -- confirm against the script.
source /home/myui/tmp/define-all.hive;

Dataset preparation for one-vs-the-rest classifiers

-- List the distinct class labels present in the training table.
select
  collect_set(label)
from
  news20mc_train;

Output (the order of the elements in the set is not guaranteed): [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20]

-- Keep the label list returned by the previous query in the Hive variable possible_labels
-- so that later statements can substitute it by name.
-- NOTE(review): the surrounding double quotes appear to be stored as part of the value, which
-- would make the substituted text a single string literal -- confirm on your Hive version.
SET hivevar:possible_labels="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20";

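Script: one-vs-rest.awk. The view below streams every training row through this script with gawk, so the script must be readable by the Hive tasks that run the query (for example, by registering it beforehand with `add file`).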

-- One-vs-the-rest training view built on top of news20mc_train.
-- Each input row (label list, rowid, label, bias-augmented features) is serialized as
-- tab-separated fields, with array items joined by commas, and streamed through
-- one-vs-rest.awk. The script output is parsed back with the same delimiters into
-- the columns (rowid, label, target, features).
-- NOTE(review): the awk script has to be reachable by the tasks that launch gawk
-- (e.g. shipped with ADD FILE) -- confirm for your cluster setup.
CREATE OR REPLACE VIEW news20_onevsrest_train AS
SELECT
  TRANSFORM (
    ${possible_labels},
    rowid,
    label,
    addBias(features)
  )
  ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t"
    COLLECTION ITEMS TERMINATED BY ","
    LINES TERMINATED BY "\n"
  USING 'gawk -f one-vs-rest.awk'
  AS (
    rowid BIGINT,
    label INT,
    target INT,
    features ARRAY<STRING>
  )
  ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t"
    COLLECTION ITEMS TERMINATED BY ","
    LINES TERMINATED BY "\n"
FROM
  news20mc_train;

-- Amplified and shuffled variant of news20_onevsrest_train.
-- amplify(3, *) is applied to every row of the base view
-- (NOTE(review): presumably it emits each row three times -- confirm against the Hivemall amplify function),
-- and CLUSTER BY rand() distributes and sorts the rows by a random key.
-- The outer column list names exactly the four columns produced by the subquery.
CREATE OR REPLACE VIEW news20_onevsrest_train_x3 AS
SELECT
  rowid,
  label,
  target,
  features
FROM (
  SELECT
    amplify(3, *) AS (rowid, label, target, features)
  FROM
    news20_onevsrest_train
) t
CLUSTER BY rand();
Clone this wiki locally