Skip to content

Latest commit

 

History

History
246 lines (193 loc) · 8.08 KB

6.1-dataset-with-jsonlist.md

File metadata and controls

246 lines (193 loc) · 8.08 KB

数据准备

下载 cifar10 原始数据

打开 cifar10 数据的官方页面: https://www.cs.toronto.edu/~kriz/cifar.html, 下载页面中的 CIFAR-10 python version.

下载文件并解压:

$ wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
$ tar zxvf cifar-10-python.tar.gz
$ ls cifar-10-batches-py
batches.meta  data_batch_2  data_batch_4  readme.html
data_batch_1  data_batch_3  data_batch_5  test_batch

将原始数据转换为图片

因为文件是经过 python pickle 库序列化的, 所以需要进行反序列化. 序列化格式如下:

  • data -- a 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image. The image is stored in row-major order, so that the first 32 entries of the array are the red channel values of the first row of the image.
    • The first 1024 entries contain the red channel values
    • The next 1024 the green
    • The final 1024 the blue.
  • labels -- a list of 10000 numbers in the range 0-9. The number at index i indicates the label of the ith image in the array data.

反序列化代码样例如下:

#!/usr/bin/python
# -*- coding: utf8 -*-

import numpy as np
import os,io
import sys
import urllib
from glob2 import glob
import csv
import json
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import image
import cPickle

from collections import OrderedDict # 有序的词典

def unpickle(file):
    """Load and return the object stored in a pickled CIFAR-10 batch file.

    Works on both Python 2 (``cPickle``) and Python 3 (``pickle``).  The
    CIFAR-10 "python version" batches were written by Python 2, so on
    Python 3 they are decoded with ``encoding='latin1'``, which keeps the
    dictionary keys as ``str`` and lets the embedded NumPy arrays load.

    Args:
        file: path of the pickled file (e.g. ``test_batch`` or
            ``batches.meta``).

    Returns:
        The unpickled object; for CIFAR-10 batches this is a dict.

    NOTE: unpickling can execute arbitrary code.  Only call this on files
    obtained from a trusted source.
    """
    try:
        import cPickle as pickle        # Python 2
        load_kwargs = {}
    except ImportError:
        import pickle                   # Python 3
        load_kwargs = {'encoding': 'latin1'}

    with open(file, 'rb') as fo:
        return pickle.load(fo, **load_kwargs)

# Load the test batch and pull each field out of the unpickled dict.
test = unpickle('test_batch')
data, labels = test['data'], test['labels']
batch_label, filenames = test['batch_label'], test['filenames']

# batches.meta carries the class names.
label_names = unpickle('batches.meta')['label_names']
# `print label_names` shows:
#   ['airplane',
#    'automobile',
#    'bird',
#    'cat',
#    'deer',
#    'dog',
#    'frog',
#    'horse',
#    'ship',
#    'truck']

#将 array 还原为图片
import matplotlib
def save_image(data, filenames):
    """Write every flattened image row in ``data`` to its matching path.

    Args:
        data: iterable of arrays, each reshapeable to ``(3, 32, 32)``
            (channel, row, column).
        filenames: iterable of output paths, paired with ``data`` by
            position; each one is handed to ``image.imsave`` as-is.
    """
    for row, target in zip(data, filenames):
        # Move the channel axis last: (channel, row, col) -> (row, col, channel).
        pixels = row.reshape(3, 32, 32).transpose(1, 2, 0)
        assert pixels.shape == (32, 32, 3)
        image.imsave(target, pixels)

图片上传至七牛 Bucket

首先, 需要下载、安装 qshell, 参考: https://developer.qiniu.com/kodo/tools/1302/qshell

之后, 登录 portal.qiniu.com, 获取自己账户的 ak/sk

$ qshell account <ak> <sk>
$ qshell qupload 4 ./upload.json

Writing upload log to file /Users/quxiao/.qshell/qupload/5949acac101b55bd7a79a6c30e67a172/5949acac101b55bd7a79a6c30e67a172.log

Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/abandoned_ship_s_000004.png => cifar10-raw-dataset/abandoned_ship_s_000004.png [1/50000, 0.0%] ...
Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/abandoned_ship_s_000024.png => cifar10-raw-dataset/abandoned_ship_s_000024.png [2/50000, 0.0%] ...
Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/abandoned_ship_s_000034.png => cifar10-raw-dataset/abandoned_ship_s_000034.png [3/50000, 0.0%] ...
Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/abandoned_ship_s_000035.png => cifar10-raw-dataset/abandoned_ship_s_000035.png [4/50000, 0.0%] ...
Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/abandoned_ship_s_000146.png => cifar10-raw-dataset/abandoned_ship_s_000146.png [5/50000, 0.0%] ...

...
Uploading /Users/quxiao/git_rep/github.com/quxiao/cifar10-example/dataset/data/image/yosemite_toad_s_000172.png => cifar10-raw-dataset/yosemite_toad_s_000172.png [50000/50000, 100.0%] ...

See upload log at path /Users/quxiao/.qshell/qupload/5949acac101b55bd7a79a6c30e67a172/5949acac101b55bd7a79a6c30e67a172.log

其中, upload.json 内容如下 (注意: `#` 之后的注释仅用于说明, JSON 不支持注释, 实际文件中需要删除):

{
   "src_dir"            :   "./data/image/",            # cifar10 图片所在目录
   "bucket"             :   "<your_bucket_name>",       # 账户下的 bucket 名称
   "key_prefix"         :   "cifar10-raw-dataset/"      # 指定的 bucket key 前缀
}

生成数据集 json list

AVA 平台数据集 json 格式

对于图片分类问题, 数据集的 json 格式如下:

{
    "url":  "qiniu:///bucketname/prefix/filename", //必填, 仅支持七牛云 Bucket 的图片, 格式可以是图片的http链接和七牛协议. 私有 Bucket 必须使用七牛协议, 格式为qiniu:///bucketname/prefix/filename, 例如qiniu:///newdata/0081.jpg_wh1200.jpg
    "type": "image",// 必填, 目前支持 "image"/"video"
    "source_url":"http://source_url", //图片的来源, augment等操作生成的图片, 来源为原图片url, 下载操作的图片, 来源为原url
    "label": [
       {
           "name":"cifar10",
           "type":"classification",
           "version":"1",
           "data":  [{
              "class": "dog",
            },
            {
                ...
            }]
       },
    ]
}

解析生成 json list 文件的样例代码如下:

# 增加生成图片和 label 的对应文件
def parse_data(data_file, label_file):
    """Dump one CIFAR-10 batch to PNG files and append its labels to a TSV.

    Images are written under ``data/image``; the directory is created when
    it does not exist yet.

    Args:
        data_file: path of a pickled batch.  ``batches.meta`` is read from
            the same directory.
        label_file: path of the ``<filename>\\t<label name>`` file.  It is
            opened in append mode, so repeated calls accumulate lines.
    """
    image_dir = 'data/image'

    print('parsing data_file %s' % (data_file))
    batch = unpickle(data_file)
    data = batch['data']
    labels = batch['labels']            # label id list
    filenames = batch['filenames']

    print('save images')
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    # save_image takes (data, filenames) only, so the target directory is
    # folded into each output path rather than passed as an extra argument.
    save_image(data, [os.path.join(image_dir, fn) for fn in filenames])

    # os.path.join keeps a bare file name relative; plain concatenation
    # would turn an empty dirname into '/batches.meta' at the filesystem root.
    meta_path = os.path.join(os.path.dirname(data_file), 'batches.meta')
    label_names = unpickle(meta_path)['label_names']
    print('label_names: %s' % label_names)

    # Record the file name <-> label name mapping, one pair per line.
    with open(label_file, "a") as out:
        for fn, label_id in zip(filenames, labels):
            out.write('%s\t%s\n' % (fn, label_names[label_id]))
    print('done')


# Host name placed right after "http://" in every image URL that
# create_jsonlist_file generates. Replace the placeholder with your own value.
bucket_domain='<your_bucket_domain>'
# Key prefix inserted between the host and the file name in those URLs.
# It is concatenated directly in front of the file name with no separator
# added, so include a trailing '/' if one is needed.
bucket_key_prefix='<your_bucket_key_prefix>'

def load_labels(label_file):
    """Read a tab-separated label file into a ``{filename: label}`` dict.

    Each line is split on tabs; the first field is the key and the second
    field, stripped of surrounding whitespace, is the value.  Lines with
    fewer than two fields are ignored, and later lines overwrite earlier
    ones that share the same key.
    """
    mapping = {}
    with open(label_file, 'r') as handle:
        for raw_line in handle:
            parts = raw_line.split('\t')
            if len(parts) >= 2:
                mapping[parts[0]] = parts[1].strip()
    return mapping

def create_jsonlist_file(image_dir, label_file, jsonlist_file):
    """Write a JSON-lines dataset file describing the labelled PNG images.

    Every ``*.png`` directly under ``image_dir`` whose base name appears in
    ``label_file`` yields one JSON object on its own line; images without a
    label entry are skipped.  URLs are built from the module-level
    ``bucket_domain`` and ``bucket_key_prefix`` values.

    Args:
        image_dir: directory that contains the PNG files.
        label_file: tab-separated file read by ``load_labels``.
        jsonlist_file: output path; overwritten, encoded as UTF-8.
    """
    import io

    image_paths = glob(image_dir + '/*.png')
    print('image num: %d' % (len(image_paths)))
    file_label_dict = load_labels(label_file)
    json_list = []

    for image_path in image_paths:
        fname = os.path.basename(image_path)
        label_name = file_label_dict.get(fname)
        if label_name is None:
            continue
        print('%s => %s' % (fname, label_name))

        image_url = 'http://%s/%s%s' % (bucket_domain, bucket_key_prefix, fname)
        json_list.append({
            'url'           : image_url,
            'source_url'    : image_url,
            'type'          : 'image',
            "label": [{
                "name":"cifar10",
                "type":"classification",
                "version":"1",
                "data":  [{
                    "class": label_name,
                }],
            }],
        })

    # io.open in text mode only accepts text objects: `unicode` on
    # Python 2, `str` on Python 3.
    try:
        to_unicode = unicode
    except NameError:
        to_unicode = str

    # ensure_ascii=False can emit non-ASCII characters, so the file is
    # opened with an explicit UTF-8 encoding rather than the platform default.
    with io.open(jsonlist_file, 'w', encoding='utf8') as out_f:
        for data_json in json_list:
            line = json.dumps(data_json, ensure_ascii=False) + '\n'
            out_f.write(to_unicode(line))

最后, 将生成的数据集 jsonlist 文件上传至 Bucket, 至此, 数据集的 jsonlist 生成完毕.

创建数据集

在 AVA 平台上, 进入 "数据集" -> "新建数据集" 页面

我们选择本地上传 jsonlist 文件, 填写数据集名称, 之后点击"确定创建". 之后, 数据集会进入"创建中"状态.

数据集状态变为"完成"之后, 可以在详情页中查看统计信息, 看出数据集的分类统计.

格式化数据集

在数据集详情页, 点击 "格式化", 训练集设置为 "70%", 验证集设置为 "30%", 格式化类型设置为 "recordio", 以及 "原图". 之后, 可以在详情页查看到数据格式化的结果.