Skip to content

Commit 42be54f

Browse files
committed
第一发
0 parents  commit 42be54f

File tree

101 files changed

+144458
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+144458
-0
lines changed

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
弥勒佛
2+
=====
3+
4+
**让天下没有难做的大数据模型!**
5+
6+
现有的机器学习框架/软件包存在几个问题:
7+
8+
* 无法处理大数据:多数Python,Matlab和R写的训练框架适合处理规模小的样本,没有为大数据优化。
9+
* 不容易整合到实际生产系统:standalone的程序无法作为library嵌入到大程序中。
10+
* 模型单一:一个软件包往往只解决一个类型的问题(比如监督式或者非监督式)。
11+
* 不容易扩展:设计时没有考虑可扩展性,难以添加新的模型和组件。
12+
* 代码质量不高:代码缺乏规范,难读懂、难维护。
13+
14+
弥勒佛项目的诞生就是为了解决上面的问题,在框架设计上满足了下面几个需求:
15+
16+
* **处理大数据**:可随业务增长scale up,无论你的数据样本是1K还是1B规模,都可使用弥勒佛项目。
17+
* **为实际生产**:模型的训练和使用都可以作为library或者service整合到在生产系统中。
18+
* **丰富的模型**:容易尝试不同的模型,在监督、非监督和在线学习等模型间方便地切换。
19+
* **高度可扩展**:容易添加新模型,方便地对新模型进行实验并迅速整合到生产系统中。
20+
* **高度可读性**:代码规范,注释和文档尽可能详尽,适合初学者进行大数据模型的学习。
21+
22+
23+
# 功能
24+
25+
下面是弥勒佛框架解决的问题类型,括号中的斜体代表尚未实现以及预计实现的时间
26+
27+
* 监督式学习:[最大熵分类模型](/doc/maxent.md)(max entropy classifier),决策树模型(decision tree based models,*2014 Q1*)
28+
* 非监督式学习:聚类问题(k-means,*2014 Q1*)
29+
* 在线学习:[在线梯度递降模型](/doc/online.md)(online stochastic gradient descent)
30+
* 神经网络(*2014 Q2/3*)
31+
32+
项目实现了下面的组件
33+
34+
* 多种[数据集](/doc/dataset.md)(in-mem,skip)
35+
* 多种[评价器](/doc/eval.md)(precision,recall,f-score,accuracy,confusion)和[交叉评价](/doc/cross_validate.md)(cross-validation)
36+
* 多种[优化器](/doc/optimizer.md):协程并发L-BFGS,梯度递降(batch, mini-batch, stochastic),[带退火的学习率](/doc/optimizer.md#学习率)(learning rate),[L1/L2正则化](/doc/optimizer.md#正则化)(regularization)
37+
* [稀疏向量](/doc/sparse_vector.md)(sparse vector)以存储和表达上亿级别的特征
38+
* [特征辞典](/doc/dictionary.md)(feature dictionary)在特征名和特征ID之间自动翻译
39+
40+
41+
# 其它
42+
43+
* [项目名称来历](/doc/naming.md)
44+
* [项目邮件列表](https://groups.google.com/forum/#!forum/mlf-users)
45+
* [联系方式](/doc/feedback.md)

clustering/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
需要实现的模型:k-means
2+
3+
预计模型实现的时间:2014 Q1

contrib/libsvm_dataset_loader.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package contrib
2+
3+
import (
4+
"github.com/huichen/mlf/data"
5+
"github.com/huichen/mlf/util"
6+
"io/ioutil"
7+
"log"
8+
"strconv"
9+
"strings"
10+
)
11+
12+
func LoadLibSVMDataset(path string, usingSparseRepresentation bool) data.Dataset {
13+
log.Print("载入libsvm格式文件", path)
14+
15+
content, err := ioutil.ReadFile(path)
16+
if err != nil {
17+
log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
18+
}
19+
lines := strings.Split(string(content), "\n")
20+
21+
minFeature := 10000
22+
maxFeature := 0
23+
24+
labels := make(map[string]int)
25+
labelIndex := 0
26+
27+
for _, l := range lines {
28+
if l == "" {
29+
continue
30+
}
31+
32+
fields := strings.Split(l, " ")
33+
34+
_, ok := labels[fields[0]]
35+
if !ok {
36+
labels[fields[0]] = labelIndex
37+
labelIndex++
38+
}
39+
40+
for i := 1; i < len(fields); i++ {
41+
if fields[i] == "" {
42+
continue
43+
}
44+
fs := strings.Split(fields[i], ":")
45+
fid, _ := strconv.Atoi(fs[0])
46+
if fid > maxFeature {
47+
maxFeature = fid
48+
}
49+
if fid < minFeature {
50+
minFeature = fid
51+
}
52+
}
53+
}
54+
55+
if minFeature != 1 || maxFeature < 2 {
56+
log.Fatal("文件输入格式不合法")
57+
}
58+
59+
set := data.NewInmemDataset()
60+
61+
for _, l := range lines {
62+
if l == "" {
63+
continue
64+
}
65+
fields := strings.Split(l, " ")
66+
67+
instance := new(data.Instance)
68+
instance.Output = &data.InstanceOutput{
69+
Label: labels[fields[0]],
70+
LabelString: fields[0],
71+
}
72+
if usingSparseRepresentation {
73+
instance.NamedFeatures = make(map[string]float64)
74+
} else {
75+
instance.Features = util.NewVector(maxFeature + 1)
76+
}
77+
78+
// 常数项
79+
if !usingSparseRepresentation {
80+
instance.Features.Set(0, 1)
81+
}
82+
83+
for i := 1; i < len(fields); i++ {
84+
if fields[i] == "" {
85+
continue
86+
}
87+
fs := strings.Split(fields[i], ":")
88+
fid, _ := strconv.Atoi(fs[0])
89+
value, _ := strconv.ParseFloat(fs[1], 64)
90+
if usingSparseRepresentation {
91+
instance.NamedFeatures[fs[0]] = value
92+
} else {
93+
instance.Features.Set(fid, value)
94+
}
95+
}
96+
set.AddInstance(instance)
97+
}
98+
99+
set.Finalize()
100+
101+
log.Print("载入数据样本数目 ", set.NumInstances())
102+
103+
return set
104+
}

contrib/libsvm_dataset_loader_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package contrib
2+
3+
import (
4+
"github.com/huichen/mlf/util"
5+
"testing"
6+
)
7+
8+
func TestLibsvmLoader(t *testing.T) {
9+
set := LoadLibSVMDataset("test.txt", false)
10+
util.Expect(t, "10", set.NumInstances())
11+
util.Expect(t, "45", set.GetOptions().FeatureDimension)
12+
util.Expect(t, "2", set.GetOptions().NumLabels)
13+
}
14+
15+
func TestSparseLibsvmLoader(t *testing.T) {
16+
set := LoadLibSVMDataset("test.txt", true)
17+
util.Expect(t, "10", set.NumInstances())
18+
util.Expect(t, "0", set.GetOptions().FeatureDimension)
19+
util.Expect(t, "2", set.GetOptions().NumLabels)
20+
}

contrib/libsvm_dataset_saver.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package contrib
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"github.com/huichen/mlf/data"
7+
"log"
8+
"os"
9+
"strconv"
10+
)
11+
12+
func SaveLibSVMDataset(path string, set data.Dataset) {
13+
log.Print("保存数据集到libsvm格式文件", path)
14+
15+
f, err := os.Create(path)
16+
defer f.Close()
17+
if err != nil {
18+
log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
19+
}
20+
w := bufio.NewWriter(f)
21+
defer w.Flush()
22+
23+
iter := set.CreateIterator()
24+
iter.Start()
25+
for !iter.End() {
26+
instance := iter.GetInstance()
27+
if instance.Output.LabelString == "" {
28+
fmt.Fprintf(w, "%d ", instance.Output.Label)
29+
} else {
30+
fmt.Fprintf(w, "%s ", instance.Output.LabelString)
31+
}
32+
for _, k := range instance.Features.Keys() {
33+
// 跳过第0个特征,因为它始终是1
34+
if k == 0 {
35+
continue
36+
}
37+
38+
if instance.Features.Get(k) != 0 {
39+
// libsvm格式的特征从1开始
40+
fmt.Fprintf(w, "%d:%s ", k, strconv.FormatFloat(instance.Features.Get(k), 'f', -1, 64))
41+
}
42+
}
43+
fmt.Fprint(w, "\n")
44+
iter.Next()
45+
}
46+
}

contrib/libsvm_dataset_saver_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package contrib
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestLibsvmSaver(t *testing.T) {
8+
set := LoadLibSVMDataset("test.txt", true)
9+
10+
SaveLibSVMDataset("save_test.txt", set)
11+
}

contrib/test.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-1 1:0.368684 2:0.141667 3:0.0454545 4:0.184681 5:0.223514 6:0.0716594 7:0.870079 8:0.913386 9:0.582677 10:0.875366 11:1 43:1
2+
-1 1:0.365683 2:0.155556 3:0.030303 4:0.151754 5:0.215762 6:0.0547984 7:0.866142 8:0.925197 9:0.594488 10:0.867838 11:1 43:1
3+
+1 1:0.472736 2:0.386111 3:0.136364 4:0.19184 5:0.307494 6:0.446817 7:0.92126 8:0.937008 9:0.531496 10:0.853339 11:1 26:1
4+
+1 1:0.463232 2:0.430556 3:0.272727 4:0.173228 5:0.375969 6:0.434172 7:0.937008 8:0.937008 9:0.480315 10:0.865886 11:1 44:1
5+
-1 1:0.368184 2:0.125 3:0.030303 4:0.10952 5:0.222222 6:0.0549389 7:0.866142 8:0.92126 9:0.590551 10:0.860449 11:1 43:1
6+
+1 1:0.36018 2:0.366667 3:0.0909091 4:0.214746 5:0.204134 6:0.00941408 7:0.905512 8:0.933071 9:0.551181 10:0.840792 11:1 43:1
7+
-1 1:0.373687 2:0.125 3:0.106061 4:0.193271 5:0.229974 6:0.088942 7:0.874016 8:0.885827 9:0.543307 10:0.872159 11:1 43:1
8+
-1 1:0.373187 2:0.136111 3:0.0606061 4:0.167502 5:0.232558 6:0.0805115 7:0.874016 8:0.905512 9:0.566929 10:0.868256 11:1 43:1
9+
-1 1:0.37919 2:0.125 3:0.136364 4:0.171797 5:0.295866 6:0.0935788 7:0.877953 8:0.870079 9:0.523622 10:0.870487 11:1 43:1
10+
-1 1:0.376688 2:0.163889 3:0.151515 4:0.176807 5:0.237726 6:0.0893635 7:0.897638 8:0.862205 9:0.488189 10:0.868535 11:1 43:1

data/dataset.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package data
2+
3+
import (
4+
"github.com/huichen/mlf/dictionary"
5+
)
6+
7+
// Dataset defines the access interface for a dataset.
//
// Dataset access has two properties:
//  1. Repeated traversals are guaranteed to yield the same order.
//  2. Every distinct sample (Instance) is visited exactly once per traversal.
type Dataset interface {
	// NumInstances returns the number of samples in the dataset.
	NumInstances() int

	// CreateIterator creates a new iterator over the dataset.
	CreateIterator() DatasetIterator

	// GetOptions returns the dataset options.
	GetOptions() DatasetOptions

	// GetFeatureDictionary returns the feature dictionary.
	// Returns nil when no feature dictionary is used (integer feature
	// IDs are supplied directly).
	GetFeatureDictionary() *dictionary.Dictionary

	// GetLabelDictionary returns the label dictionary.
	// Returns nil when no label dictionary is used (integer labels are
	// supplied directly).
	GetLabelDictionary() *dictionary.Dictionary
}

data/dataset_iterator.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package data
2+
3+
// DatasetIterator iterates over a dataset.
type DatasetIterator interface {
	// Start positions the iterator at the beginning of the data.
	Start()

	// End reports whether the end of the dataset has been reached.
	End() bool

	// Next advances to the next sample.
	Next()

	// Skip skips n samples, n >= 0; Skip(1) is equivalent to Next().
	Skip(n int)

	// GetInstance returns the current sample.
	// Check End() before calling; returns a nil pointer on access
	// failure or when End() is true.
	GetInstance() *Instance
}

data/dataset_options.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package data
2+
3+
// DatasetOptions holds dataset parameters.
type DatasetOptions struct {
	// FeatureIsSparse reports whether features are stored as sparse vectors.
	FeatureIsSparse bool

	// FeatureDimension is the feature dimension; only valid when
	// FeatureIsSparse == false.
	FeatureDimension int

	// IsSupervisedLearning reports whether this is supervised-learning data.
	IsSupervisedLearning bool

	// NumLabels is the number of output labels (i.e. the number of classes).
	// Valid label values lie in [0, NumLabels-1].
	NumLabels int

	// Options holds other user-defined options.
	Options interface{}
}

0 commit comments

Comments
 (0)