Skip to content

Commit 42be54f

Browse files
committed
第一发
0 parents  commit 42be54f

File tree

101 files changed

+144458
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+144458
-0
lines changed

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
弥勒佛
2+
=====
3+
4+
**让天下没有难做的大数据模型!**
5+
6+
现有的机器学习框架/软件包存在几个问题:
7+
8+
* 无法处理大数据:多数Python,Matlab和R写的训练框架适合处理规模小的样本,没有为大数据优化。
9+
* 不容易整合到实际生产系统:standalone的程序无法作为library嵌入到大程序中。
10+
* 模型单一:一个软件包往往只解决一个类型的问题(比如监督式或者非监督式)。
11+
* 不容易扩展:设计时没有考虑可扩展性,难以添加新的模型和组件。
12+
* 代码质量不高:代码缺乏规范,难读懂、难维护。
13+
14+
弥勒佛项目的诞生就是为了解决上面的问题,在框架设计上满足了下面几个需求:
15+
16+
* **处理大数据**:可随业务增长scale up,无论你的数据样本是1K还是1B规模,都可使用弥勒佛项目。
17+
* **为实际生产**:模型的训练和使用都可以作为library或者service整合到在生产系统中。
18+
* **丰富的模型**:容易尝试不同的模型,在监督、非监督和在线学习等模型间方便地切换。
19+
* **高度可扩展**:容易添加新模型,方便地对新模型进行实验并迅速整合到生产系统中。
20+
* **高度可读性**:代码规范,注释和文档尽可能详尽,适合初学者进行大数据模型的学习。
21+
22+
23+
# 功能
24+
25+
下面是弥勒佛框架解决的问题类型,括号中的斜体代表尚未实现以及预计实现的时间
26+
27+
* 监督式学习:[最大熵分类模型](/doc/maxent.md)(max entropy classifier),决策树模型(decision tree based models,*2014 Q1*)
28+
* 非监督式学习:聚类问题(k-means,*2014 Q1*)
29+
* 在线学习:[在线梯度递降模型](/doc/online.md)(online stochastic gradient descent)
30+
* 神经网络(*2014 Q2/3*)
31+
32+
项目实现了下面的组件
33+
34+
* 多种[数据集](/doc/dataset.md)(in-mem,skip)
35+
* 多种[评价器](/doc/eval.md)(precision,recall,f-score,accuracy,confusion)和[交叉评价](/doc/cross_validate.md)(cross-validation)
36+
* 多种[优化器](/doc/optimizer.md):协程并发L-BFGS,梯度递降(batch, mini-batch, stochastic),[带退火的学习率](/doc/optimizer.md#学习率)(learning rate),[L1/L2正则化](/doc/optimizer.md#正则化)(regularization)
37+
* [稀疏向量](/doc/sparse_vector.md)(sparse vector)以存储和表达上亿级别的特征
38+
* [特征辞典](/doc/dictionary.md)(feature dictionary)在特征名和特征ID之间自动翻译
39+
40+
41+
# 其它
42+
43+
* [项目名称来历](/doc/naming.md)
44+
* [项目邮件列表](https://groups.google.com/forum/#!forum/mlf-users)
45+
* [联系方式](/doc/feedback.md)

clustering/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
需要实现的模型:k-means
2+
3+
预计模型实现的时间:2014 Q1

contrib/libsvm_dataset_loader.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package contrib
2+
3+
import (
4+
"github.com/huichen/mlf/data"
5+
"github.com/huichen/mlf/util"
6+
"io/ioutil"
7+
"log"
8+
"strconv"
9+
"strings"
10+
)
11+
12+
func LoadLibSVMDataset(path string, usingSparseRepresentation bool) data.Dataset {
13+
log.Print("载入libsvm格式文件", path)
14+
15+
content, err := ioutil.ReadFile(path)
16+
if err != nil {
17+
log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
18+
}
19+
lines := strings.Split(string(content), "\n")
20+
21+
minFeature := 10000
22+
maxFeature := 0
23+
24+
labels := make(map[string]int)
25+
labelIndex := 0
26+
27+
for _, l := range lines {
28+
if l == "" {
29+
continue
30+
}
31+
32+
fields := strings.Split(l, " ")
33+
34+
_, ok := labels[fields[0]]
35+
if !ok {
36+
labels[fields[0]] = labelIndex
37+
labelIndex++
38+
}
39+
40+
for i := 1; i < len(fields); i++ {
41+
if fields[i] == "" {
42+
continue
43+
}
44+
fs := strings.Split(fields[i], ":")
45+
fid, _ := strconv.Atoi(fs[0])
46+
if fid > maxFeature {
47+
maxFeature = fid
48+
}
49+
if fid < minFeature {
50+
minFeature = fid
51+
}
52+
}
53+
}
54+
55+
if minFeature != 1 || maxFeature < 2 {
56+
log.Fatal("文件输入格式不合法")
57+
}
58+
59+
set := data.NewInmemDataset()
60+
61+
for _, l := range lines {
62+
if l == "" {
63+
continue
64+
}
65+
fields := strings.Split(l, " ")
66+
67+
instance := new(data.Instance)
68+
instance.Output = &data.InstanceOutput{
69+
Label: labels[fields[0]],
70+
LabelString: fields[0],
71+
}
72+
if usingSparseRepresentation {
73+
instance.NamedFeatures = make(map[string]float64)
74+
} else {
75+
instance.Features = util.NewVector(maxFeature + 1)
76+
}
77+
78+
// 常数项
79+
if !usingSparseRepresentation {
80+
instance.Features.Set(0, 1)
81+
}
82+
83+
for i := 1; i < len(fields); i++ {
84+
if fields[i] == "" {
85+
continue
86+
}
87+
fs := strings.Split(fields[i], ":")
88+
fid, _ := strconv.Atoi(fs[0])
89+
value, _ := strconv.ParseFloat(fs[1], 64)
90+
if usingSparseRepresentation {
91+
instance.NamedFeatures[fs[0]] = value
92+
} else {
93+
instance.Features.Set(fid, value)
94+
}
95+
}
96+
set.AddInstance(instance)
97+
}
98+
99+
set.Finalize()
100+
101+
log.Print("载入数据样本数目 ", set.NumInstances())
102+
103+
return set
104+
}

contrib/libsvm_dataset_loader_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package contrib
2+
3+
import (
4+
"github.com/huichen/mlf/util"
5+
"testing"
6+
)
7+
8+
func TestLibsvmLoader(t *testing.T) {
9+
set := LoadLibSVMDataset("test.txt", false)
10+
util.Expect(t, "10", set.NumInstances())
11+
util.Expect(t, "45", set.GetOptions().FeatureDimension)
12+
util.Expect(t, "2", set.GetOptions().NumLabels)
13+
}
14+
15+
func TestSparseLibsvmLoader(t *testing.T) {
16+
set := LoadLibSVMDataset("test.txt", true)
17+
util.Expect(t, "10", set.NumInstances())
18+
util.Expect(t, "0", set.GetOptions().FeatureDimension)
19+
util.Expect(t, "2", set.GetOptions().NumLabels)
20+
}

contrib/libsvm_dataset_saver.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package contrib
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"github.com/huichen/mlf/data"
7+
"log"
8+
"os"
9+
"strconv"
10+
)
11+
12+
func SaveLibSVMDataset(path string, set data.Dataset) {
13+
log.Print("保存数据集到libsvm格式文件", path)
14+
15+
f, err := os.Create(path)
16+
defer f.Close()
17+
if err != nil {
18+
log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
19+
}
20+
w := bufio.NewWriter(f)
21+
defer w.Flush()
22+
23+
iter := set.CreateIterator()
24+
iter.Start()
25+
for !iter.End() {
26+
instance := iter.GetInstance()
27+
if instance.Output.LabelString == "" {
28+
fmt.Fprintf(w, "%d ", instance.Output.Label)
29+
} else {
30+
fmt.Fprintf(w, "%s ", instance.Output.LabelString)
31+
}
32+
for _, k := range instance.Features.Keys() {
33+
// 跳过第0个特征,因为它始终是1
34+
if k == 0 {
35+
continue
36+
}
37+
38+
if instance.Features.Get(k) != 0 {
39+
// libsvm格式的特征从1开始
40+
fmt.Fprintf(w, "%d:%s ", k, strconv.FormatFloat(instance.Features.Get(k), 'f', -1, 64))
41+
}
42+
}
43+
fmt.Fprint(w, "\n")
44+
iter.Next()
45+
}
46+
}

contrib/libsvm_dataset_saver_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package contrib
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestLibsvmSaver(t *testing.T) {
8+
set := LoadLibSVMDataset("test.txt", true)
9+
10+
SaveLibSVMDataset("save_test.txt", set)
11+
}

contrib/test.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-1 1:0.368684 2:0.141667 3:0.0454545 4:0.184681 5:0.223514 6:0.0716594 7:0.870079 8:0.913386 9:0.582677 10:0.875366 11:1 43:1
2+
-1 1:0.365683 2:0.155556 3:0.030303 4:0.151754 5:0.215762 6:0.0547984 7:0.866142 8:0.925197 9:0.594488 10:0.867838 11:1 43:1
3+
+1 1:0.472736 2:0.386111 3:0.136364 4:0.19184 5:0.307494 6:0.446817 7:0.92126 8:0.937008 9:0.531496 10:0.853339 11:1 26:1
4+
+1 1:0.463232 2:0.430556 3:0.272727 4:0.173228 5:0.375969 6:0.434172 7:0.937008 8:0.937008 9:0.480315 10:0.865886 11:1 44:1
5+
-1 1:0.368184 2:0.125 3:0.030303 4:0.10952 5:0.222222 6:0.0549389 7:0.866142 8:0.92126 9:0.590551 10:0.860449 11:1 43:1
6+
+1 1:0.36018 2:0.366667 3:0.0909091 4:0.214746 5:0.204134 6:0.00941408 7:0.905512 8:0.933071 9:0.551181 10:0.840792 11:1 43:1
7+
-1 1:0.373687 2:0.125 3:0.106061 4:0.193271 5:0.229974 6:0.088942 7:0.874016 8:0.885827 9:0.543307 10:0.872159 11:1 43:1
8+
-1 1:0.373187 2:0.136111 3:0.0606061 4:0.167502 5:0.232558 6:0.0805115 7:0.874016 8:0.905512 9:0.566929 10:0.868256 11:1 43:1
9+
-1 1:0.37919 2:0.125 3:0.136364 4:0.171797 5:0.295866 6:0.0935788 7:0.877953 8:0.870079 9:0.523622 10:0.870487 11:1 43:1
10+
-1 1:0.376688 2:0.163889 3:0.151515 4:0.176807 5:0.237726 6:0.0893635 7:0.897638 8:0.862205 9:0.488189 10:0.868535 11:1 43:1

data/dataset.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package data
2+
3+
import (
4+
"github.com/huichen/mlf/dictionary"
5+
)
6+
7+
// Dataset defines the access interface for a dataset.
//
// Dataset access has two properties:
//  1. Repeated traversals are guaranteed to yield the same order.
//  2. Every distinct sample (Instance) is visited exactly once per traversal.
type Dataset interface {
	// NumInstances returns the number of samples in the dataset.
	NumInstances() int

	// CreateIterator creates a new iterator over the dataset.
	CreateIterator() DatasetIterator

	// GetOptions returns the dataset options.
	GetOptions() DatasetOptions

	// GetFeatureDictionary returns the feature dictionary.
	// Returns nil when no feature dictionary is used (integer feature
	// IDs are supplied directly).
	GetFeatureDictionary() *dictionary.Dictionary

	// GetLabelDictionary returns the label dictionary.
	// Returns nil when no label dictionary is used (integer labels are
	// supplied directly).
	GetLabelDictionary() *dictionary.Dictionary
}

data/dataset_iterator.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package data
2+
3+
// DatasetIterator iterates over a dataset.
type DatasetIterator interface {
	// Start positions the iterator at the beginning of the data.
	Start()

	// End reports whether the end of the dataset has been reached.
	End() bool

	// Next advances to the next sample.
	Next()

	// Skip skips n samples, n >= 0; Skip(1) is equivalent to Next().
	Skip(n int)

	// GetInstance returns the current sample.
	// Check End() before calling; returns a nil pointer on access
	// failure or when End() is true.
	GetInstance() *Instance
}

data/dataset_options.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package data
2+
3+
// DatasetOptions holds dataset parameters.
type DatasetOptions struct {
	// FeatureIsSparse reports whether features are stored as sparse vectors.
	FeatureIsSparse bool

	// FeatureDimension is the feature dimension; only valid when
	// FeatureIsSparse == false.
	FeatureDimension int

	// IsSupervisedLearning reports whether this is supervised-learning data.
	IsSupervisedLearning bool

	// NumLabels is the number of output labels (i.e. the number of classes).
	// Valid label values lie in [0, NumLabels-1].
	NumLabels int

	// Options holds other user-defined options.
	Options interface{}
}

0 commit comments

Comments
 (0)