-
Notifications
You must be signed in to change notification settings - Fork 73
/
ffm.py
121 lines (93 loc) · 3.74 KB
/
ffm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import graphlab as gl
import libffm
class FFM(object):
    """Thin Python wrapper around the ``libffm`` field-aware factorization
    machine bindings.

    All work is delegated to the ``libffm.ffm_py`` object stored in
    ``self.m``; this class only validates arguments and fills in defaults.
    """

    def __init__(self, eta=.1, k=4, lam=0.0):
        """
        Create a field-aware factorization machine model.

        Parameters
        ----------
        eta : float
          Learning rate.

        k : int
          Number of latent factors.

        lam : float
          Regularization parameter.

        References
        ----------
        - `libffm: open source C++ library
          <http://www.csie.ntu.edu.tw/~r01922136/libffm/>`_
        - `FFM formulation details
          <http://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf>`_
        - `Criteo winning submission details
          <http://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf>`_
        """
        self.m = libffm.ffm_py()
        # NOTE: the backend takes (eta, lam, k), which is a different order
        # from this constructor's (eta, k, lam).
        self.m.init_model(eta, lam, k)

    def fit(self, train, validation_set=None,
            target='target', features=None,
            max_feature_id=2**18,
            nr_iters=15, nr_threads=1,
            quiet=False):
        """
        Train the model.

        Parameters
        ----------
        train : SFrame
          A training dataset containing a prediction target and feature
          columns that are dict type. Each column will be considered a
          "field" in the model. Each column element must be a dictionary
          with integer keys and float values.

        validation_set : SFrame, optional
          A validation set to use for progress reporting. This must have
          exactly the same column names as the training data. When omitted,
          an empty frame (``train.head(0)``) is passed to the backend.

        target : str
          The name of the column to predict. This column should be float
          typed.

        features : list, optional
          The names of the feature columns that you want to use. Defaults
          to every column of `train` other than `target`.

        max_feature_id : int
          Forwarded unchanged to the backend ``fit`` call. Presumably the
          upper bound on the integer feature keys -- confirm against the
          libffm wrapper.

        nr_iters : int
          The number of iterations to train the model.

        nr_threads : int
          The number of threads to use.

        quiet : boolean
          Forwarded unchanged as the backend's ``quiet`` setting. When true,
          progress reporting is expected to be suppressed (libffm's quiet
          mode) -- confirm against the libffm wrapper.

        Returns
        -------
        None

        Raises
        ------
        ValueError
          If `target` is not a column of `train`, or if `validation_set` is
          given and its column names differ from those of `train`.

        Note
        ----
        The original library has two additional options that have not (yet)
        been exposed in this library:

        - random: rows can be processed in random order. When using this
          wrapper, it's best to shuffle ahead of time.
        - normalization: sometimes this algorithm benefits from normalizing
          the values row-wise. This wrapper currently requires you to do
          that ahead of time.
        """
        train_columns = train.column_names()

        if target not in train_columns:
            raise ValueError(
                "Target column `{0}` not found in dataset.".format(target))

        if validation_set is not None:
            if train_columns != validation_set.column_names():
                raise ValueError(
                    "Train, validation data must have the same column names.")
        else:
            # The backend always receives a validation frame; an empty one
            # with the training schema stands in when none is supplied.
            validation_set = train.head(0)

        if features is None:
            # Compare by value: an identity test (`is not`) lets an equal but
            # distinct string object through, leaking the target column into
            # the feature list.
            features = [c for c in train_columns if c != target]

        self.m.set_param(nr_iters, nr_threads, quiet)
        self.m.fit(train, validation_set, target, features, max_feature_id)

    def predict(self, test):
        """
        Make predictions on a test set.

        Parameters
        ----------
        test : SFrame
          This should be in the same format as the training data. This
          ignores any columns having the same name as the target used
          during training.

        Returns
        -------
        out : SArray
          An SArray of predictions. This should have the same length as the
          number of rows in the provided `test` SFrame.
        """
        return self.m.predict(test)