Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FastAI datasets #56

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@ version = "0.5.5"
[deps]
BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
FileTrees = "72696420-646e-6120-6e77-6f6420746567"
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
LearnBase = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[compat]
Expand Down
57 changes: 57 additions & 0 deletions src/FastAI/FastAIDatasets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
    FastAIDatasets

Data containers, file utilities and loaders for datasets from the fastai dataset
repository. Every dataset configuration is registered through `initdatadeps` when the
module is initialised.
"""
module FastAIDatasets

using DataDeps
using FilePathsBase
using FilePathsBase: filename
import FileIO
using FileTrees
using MLDataPattern
using MLDataPattern: splitobs
import LearnBase
using Colors
using FixedPointNumbers

include("fastaidatasets.jl")

# Runs when the module is loaded: registers all dataset configurations.
__init__() = initdatadeps()

include("containers.jl")
include("transformations.jl")
include("load.jl")

# Reexports from MLDataPattern
export splitobs, getobs, nobs

# Container transformations
export mapobs, filterobs, groupobs, joinobs

# Primitive containers
# NOTE(review): `TableDataset` is only a TODO in containers.jl — confirm it is defined
# before release.
export FileDataset, TableDataset

# Utilities
export isimagefile, loadfile, filename

# Datasets
# NOTE(review): `loadtaskdata` is not defined in load.jl — presumably it lives in
# another included file; verify.
export DATASETS, loadtaskdata, datasetpath

end # module
49 changes: 49 additions & 0 deletions src/FastAI/containers.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@


# FileDataset

"""
    FileDataset(tree, nodes)

Data container backed by a `FileTree`. It holds one observation per entry of `nodes`;
`getobs` returns the path of the corresponding file node.
"""
struct FileDataset
# File tree the container was built from; its `name` is printed by `show`.
tree::FileTree
# File nodes of the tree, one per observation.
nodes::Vector{FileTrees.File}
end

"""
    FileDataset(args...; kwargs...)

Build a `FileDataset` by forwarding all arguments to `FileTree` and collecting the
resulting tree's files with `FileTrees.files`.
"""
function FileDataset(args...; kwargs...)
    filetree = FileTree(args...; kwargs...)
    filenodes = FileTrees.files(filetree)
    return FileDataset(filetree, filenodes)
end

# Compact one-line representation: tree name and number of observations.
function Base.show(io::IO, data::FileDataset)
    treename = data.tree.name
    count = nobs(data)
    return print(io, "FileDataset(\"", treename, "\", ", count, " observations)")
end

# Number of observations: one per file node.
LearnBase.nobs(ds::FileDataset) = length(ds.nodes)

# Observation `idx` is the path of the `idx`-th file node. Any `Integer` index is
# accepted (not only `Int`), so e.g. `Int32` or `UInt` indices dispatch here as well.
LearnBase.getobs(ds::FileDataset, idx::Integer) = path(ds.nodes[idx])


# File utilities

"""
    loadfile(file)

Read `file` from disk with `FileIO.load` and return the loaded data.

`file` may be a `String`, an `AbstractPath` or a `FileTrees.File`; the latter two are
converted and handled by the `String` method. Files recognised by `isimagefile` are
loaded with `view = true`.
"""
function loadfile(file::String)
    # `view = true` gives faster image loading.
    return isimagefile(file) ? FileIO.load(file, view = true) : FileIO.load(file)
end

function loadfile(file::AbstractPath)
    return loadfile(string(file))
end

function loadfile(file::FileTrees.File)
    return loadfile(path(file))
end


# File extensions treated as images. Matched against the lowercased file name, so the
# check is case-insensitive.
const IMAGEFILE_REGEX = r"\.(gif|jpe?g|tiff?|png|webp|bmp)$"

"""
    isimagefile(file) -> Bool

Return `true` if the name of `file` ends in one of the image extensions listed in
`IMAGEFILE_REGEX`. `file` may be a `String`, an `AbstractPath` or a `FileTrees.File`.
"""
isimagefile(file::String) = occursin(IMAGEFILE_REGEX, lowercase(file))
isimagefile(file::AbstractPath) = isimagefile(string(file))
# Qualified as `FileTrees.File`, like every other use of the type in this file, so the
# method does not depend on `File` being exported.
isimagefile(file::FileTrees.File) = isimagefile(file.name)


## TODO: TableDataset
126 changes: 126 additions & 0 deletions src/FastAI/fastaidatasets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
    FastAIDataset(name, subfolder, extension, description, checksum, datadepname, size)

Configuration of one downloadable dataset archive.

# Fields
- `name`: archive name; used in the download URL and in the DataDep message.
- `subfolder`: folder appended to `ROOT_URL` when building the download URL.
- `extension`: archive file extension, e.g. `"tgz"` or `"zip"`.
- `description`: free text inserted into the DataDep message.
- `checksum`: expected checksum of the archive; `""` when not known.
- `datadepname`: name the dataset is registered under (prefixed with `"fastai-"`).
- `size`: human-readable download size shown in the DataDep message.
"""
struct FastAIDataset
    name::String
    subfolder::String
    extension::String
    description::String
    checksum::String
    datadepname::String
    size::String
end

# URL prefix of the dataset archives. The dataset's `subfolder` is appended directly
# (no separator) when the download URL is built.
const ROOT_URL = "https://s3.amazonaws.com/fast-ai-"

"""
    FastAIDataset(name, subfolder, checksum = "";
                  extension = "tgz", description = "", datadepname = name, size = "???")

Convenience constructor that fills in defaults: no checksum, a `"tgz"` archive, an
empty description, a DataDep name equal to `name` and an unknown size (`"???"`).
"""
function FastAIDataset(
        name,
        subfolder,
        checksum = "";
        extension = "tgz",
        description = "",
        datadepname = name,
        size = "???")
    return FastAIDataset(
        name, subfolder, extension, description, checksum, datadepname, size)
end


# Configuration of every dataset archive known to this module, grouped by the
# subfolder it is downloaded from. The optional third positional argument is the
# archive checksum; entries without one fall back to the constructor default (`""`).
# The COCO entries override `datadepname` (prefixed with "coco-") and use "zip" archives.
const DATASETCONFIGS = [
# imageclas
FastAIDataset("CUB_200_2011", "imageclas"),
FastAIDataset("bedroom", "imageclas"),
FastAIDataset("caltech_101", "imageclas"),
FastAIDataset("cifar10", "imageclas", "637c5814e11aefcb6ee76d5f59c67ddc8de7f5b5077502a195b0833d1e3e4441"),
FastAIDataset("cifar100", "imageclas", "085ac613ceb0b3659c8072143ae553d5dd146b3c4206c3672a56ed02d0e77d28"),
FastAIDataset("food-101", "imageclas"),
FastAIDataset("imagenette-160", "imageclas"),
FastAIDataset("imagenette-320", "imageclas"),
FastAIDataset("imagenette", "imageclas"),
FastAIDataset("imagenette2-160", "imageclas", "88daccb09b6fce93f45e6c09ddeb269cce705549e6bff322092a2a5a11489863"),
FastAIDataset("imagenette2-320", "imageclas", "569b4497c98db6dd29f335d1f109cf315fe127053cedf69010d047f0188e158c"),
FastAIDataset("imagenette2", "imageclas"),
FastAIDataset("imagewang-160", "imageclas"),
FastAIDataset("imagewang-320", "imageclas"),
FastAIDataset("imagewang", "imageclas"),
FastAIDataset("imagewoof-160", "imageclas"),
FastAIDataset("imagewoof-320", "imageclas"),
FastAIDataset("imagewoof", "imageclas"),
FastAIDataset("imagewoof2-160", "imageclas", "663c22f69c2802d85e2a67103c017e047096702ffddf9149a14011b7002539bf"),
FastAIDataset("imagewoof2-320", "imageclas"),
FastAIDataset("imagewoof2", "imageclas"),
FastAIDataset("mnist_png", "imageclas", "9e18edaa3a08b065d8f80a019ca04329e6d9b3e391363414a9bd1ada30563672"),
FastAIDataset("mnist_var_size_tiny", "imageclas", "8a0f6ca04c2d31810dc08e739c7fa9b612e236383f70dd9fc6e5a62e672e2283"),
FastAIDataset("oxford-102-flowers", "imageclas"),
FastAIDataset("oxford-iiit-pet", "imageclas"),
FastAIDataset("stanford-cars", "imageclas"),

# nlp
FastAIDataset("ag_news_csv", "nlp"),
FastAIDataset("amazon_review_full_csv", "nlp"),
FastAIDataset("amazon_review_polarity_csv", "nlp"),
FastAIDataset("dbpedia_csv", "nlp"),
FastAIDataset("giga-fren", "nlp"),
FastAIDataset("imdb", "nlp"),
FastAIDataset("sogou_news_csv", "nlp"),
FastAIDataset("wikitext-103", "nlp"),
FastAIDataset("wikitext-2", "nlp"),
FastAIDataset("yahoo_answers_csv", "nlp"),
FastAIDataset("yelp_review_full_csv", "nlp"),
FastAIDataset("yelp_review_polarity_csv", "nlp"),

# imagelocal
FastAIDataset("biwi_head_pose", "imagelocal"),
FastAIDataset("camvid", "imagelocal"),
FastAIDataset("pascal-voc", "imagelocal"),
FastAIDataset("pascal_2007", "imagelocal"),
FastAIDataset("pascal_2012", "imagelocal"),
FastAIDataset("siim_small", "imagelocal"),
FastAIDataset("skin-lesion", "imagelocal"),
FastAIDataset("tcga-small", "imagelocal"),

# sample
FastAIDataset("adult_sample", "sample"),
FastAIDataset("biwi_sample", "sample"),
FastAIDataset("camvid_tiny", "sample", "cd42a9bdd8ad3e0ce87179749beae05b4beb1ae6ab665841180b1d8022fc230b"),
FastAIDataset("dogscats", "sample"),
FastAIDataset("human_numbers", "sample"),
FastAIDataset("imdb_sample", "sample"),
FastAIDataset("mnist_sample", "sample"),
FastAIDataset("mnist_tiny", "sample"),
FastAIDataset("movie_lens_sample", "sample"),
FastAIDataset("planet_sample", "sample"),
FastAIDataset("planet_tiny", "sample"),

# coco
FastAIDataset("coco_sample", "coco", "56960c0ac09ff35cd8588823d37e1ed0954cb88b8bfbd214a7763e72f982911c", size = "3GB"),
FastAIDataset("train2017", "coco", datadepname="coco-train2017", extension="zip"),
FastAIDataset("val2017", "coco", datadepname="coco-val2017", extension="zip"),
FastAIDataset("test2017", "coco", datadepname="coco-test2017", extension="zip"),
FastAIDataset("unlabeled2017", "coco", datadepname="coco-unlabeled2017", extension="zip"),
FastAIDataset("image_info_test2017", "coco", datadepname="coco-image_info_test2017", extension="zip"),
FastAIDataset("image_info_unlabeled2017", "coco", datadepname="coco-image_info_unlabeled2017", extension="zip"),
FastAIDataset("annotations_trainval2017", "coco", datadepname="coco-annotations_trainval2017", extension="zip"),
FastAIDataset("stuff_annotations_trainval2017", "coco", datadepname="coco-stuff_annotations_trainval2017", extension="zip"),
FastAIDataset("panoptic_annotations_trainval2017", "coco", datadepname="coco-panoptic_annotations_trainval2017", extension="zip"),
]

# The `datadepname` of every entry in `DATASETCONFIGS`.
const DATASETS = map(config -> config.datadepname, DATASETCONFIGS)

# All entries from the "imageclas" subfolder, followed by three datasets that are
# listed under "sample" in `DATASETCONFIGS`.
const DATASETS_IMAGECLASSIFICATION = [
    [config.datadepname for config in DATASETCONFIGS if config.subfolder == "imageclas"];
    ["mnist_sample", "mnist_tiny", "dogscats"]
]


"""
    DataDep(d::FastAIDataset)

Build the `DataDep` for dataset configuration `d`.

The DataDep is named `"fastai-\$(d.datadepname)"`, downloads
`ROOT_URL * d.subfolder * "/" * d.name * "." * d.extension` and unpacks the archive
after fetching. When `d` has no checksum, `nothing` is passed so that DataDeps treats
the hash as not provided.
"""
function DataDeps.DataDep(d::FastAIDataset)
    # An empty checksum means "unknown". Passing `""` on as the expected hash can never
    # match a real digest, so verification would fail for every such dataset.
    checksum = (d.checksum === nothing || isempty(d.checksum)) ? nothing : d.checksum
    return DataDep(
        "fastai-$(d.datadepname)",
        """
        "$(d.name)" from the fastai dataset repository (https://course.fast.ai/datasets)

        $(d.description)

        Download size: $(d.size)
        """,
        "$(ROOT_URL)$(d.subfolder)/$(d.name).$(d.extension)",
        checksum,
        post_fetch_method = DataDeps.unpack,
    )
end

# Register the `DataDep` of every entry in `DATASETCONFIGS`.
function initdatadeps()
    foreach(DATASETCONFIGS) do config
        DataDeps.register(DataDep(config))
    end
    return nothing
end
121 changes: 121 additions & 0 deletions src/FastAI/load.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@

"""
    datasetpath(name)

Return the folder in which dataset `name` is stored.

If it hasn't been downloaded yet, you will be asked if you want to
download it. See [`DATASETS`](#) for a list of available datasets.
"""
function datasetpath(name)
    datadeppath = @datadep_str "fastai-$name"
    # `name` is a DataDep name (`datadepname`), but the archive is downloaded under the
    # config's `name`. The two differ for entries that override `datadepname` (e.g.
    # "coco-train2017" vs. "train2017"), so look the archive name up.
    # NOTE(review): assumes the archive unpacks into a folder named after the archive,
    # as the previous code did for datasets where both names coincide — confirm.
    idx = findfirst(d -> d.datadepname == name, DATASETCONFIGS)
    folder = idx === nothing ? name : DATASETCONFIGS[idx].name
    return Path(joinpath(datadeppath, folder))
end


# Classification datasets

"""
    loaddataclassification(dir; split = false, filterparent = !=("test"))

Load a data container for image classification with observations
`(input = image, target = class)`.

Only image files whose parent folder name satisfies `filterparent` are kept; by
default, files directly inside a folder named `"test"` are dropped. Additional keyword
arguments are accepted and ignored.

If `split` is `true`, returns a tuple of the data containers split by
the name of the grandparent folder.

`dir` should contain the data in the following canonical format:

    - dir
        - split (e.g. "train", "valid"...)
            - class (e.g. "cat", "dog"...)
                - image32434.{jpg/png/...}
                - ...
            - ...
        - ...
"""
function loaddataclassification(
        dir;
        split = false,
        filterparent = !=("test"),
        kwargs...)
    files = filterobs(FileDataset(dir)) do path
        isimagefile(path) && filterparent(filename(parent(path)))
    end

    # Pair every image with the name of its parent folder, i.e. its class.
    withtargets(container) = mapobs(
        (input = loadfile, target = path -> filename(parent(path))),
        container,
    )

    if split
        groups = groupobs(files) do path
            filename(parent(parent(path)))
        end
        return Tuple(withtargets(group) for group in groups)
    else
        return withtargets(files)
    end
end

# Positional form of the earlier signature, kept so that existing calls such as
# `loaddataclassification(dir, true)` keep working. Extra positional arguments are
# ignored, as before.
function loaddataclassification(dir, split, filterparent = !=("test"), _unused...)
    return loaddataclassification(dir; split = split, filterparent = filterparent)
end


"""
    getclassesclassification(dir::AbstractPath)
    getclassesclassification(name::String)

Return the classes of the classification dataset saved in `dir`: the unique names of
all parent folders that contain image files.

When given a dataset `name`, the folder is resolved with `datasetpath` first.
"""
function getclassesclassification(dir)
    imagepaths = filterobs(isimagefile, FileDataset(dir))
    classnames = mapobs(path -> filename(parent(path)), imagepaths)
    # NOTE(review): `eachobsparallel` does not come from any import visible in the
    # module file — presumably provided elsewhere; verify.
    observations = eachobsparallel(classnames, useprimary=true, buffered=false)
    return unique(collect(observations))
end

function getclassesclassification(name::String)
    return getclassesclassification(datasetpath(name))
end

# Segmentation datasets

"""
    loaddatasegmentation(dir; split = false)

Load a data container for image segmentation with observations
`(input = image, target = mask)`.

Images are the image files found under `joinpath(dir, "images")`; masks are the image
files found under `joinpath(dir, "labels")`, loaded and passed through `maskfromimage`.

`split` and any further keyword arguments are accepted but are not used by this
implementation: a single data container is always returned.
"""
function loaddatasegmentation(dir; split=false, kwargs...)
    imagefiles = filterobs(isimagefile, FileDataset(joinpath(dir, "images")))
    images = mapobs(loadfile, imagefiles)

    maskfiles = filterobs(isimagefile, FileDataset(joinpath(dir, "labels")))
    masks = mapobs(maskfromimage ∘ loadfile, maskfiles)

    # NOTE(review): images and masks are combined by position — presumably both
    # folders list matching files in the same order; confirm.
    return mapobs((input = obs -> obs[1], target = obs -> obs[2]), (images, masks))
end


"""
    getclassessegmentation(dir::AbstractPath)
    getclassessegmentation(name::String)

Get the list of classes for the segmentation dataset saved in `dir`.

The classes are read from a newline-delimited file called "codes.txt" in `dir`; one
class per line. When given a dataset `name`, the folder is resolved with `datasetpath`.
"""
function getclassessegmentation(dir::AbstractPath)
    # `open(f, ...)` closes the file once `readlines` is done (even if it throws), so
    # the handle is not leaked.
    return open(readlines, joinpath(dir, "codes.txt"))
end
getclassessegmentation(name::String) = getclassessegmentation(datasetpath(name))




# Unwrap the element type one layer at a time: the array is reinterpreted as the type
# parameter `T` of its elements and `maskfromimage` is dispatched again on the result.
function maskfromimage(a::AbstractArray{<:Gray{T}}) where T
    return maskfromimage(reinterpret(T, a))
end

function maskfromimage(a::AbstractArray{<:Normed{T}}) where T
    return maskfromimage(reinterpret(T, a))
end
# Add one to every element, keeping the element type `I`.
# NOTE(review): presumably turns 0-based label values into 1-based class indices —
# confirm. For fixed-width integer types the addition wraps at `typemax(I)`.
maskfromimage(a::AbstractArray{I}) where {I<:Integer} = broadcast(+, a, one(I))
Loading