From a8f76d5b44afffd86f40be568677d30e4692796f Mon Sep 17 00:00:00 2001
From: lorenzoh
Date: Thu, 25 Feb 2021 19:17:17 +0100
Subject: [PATCH] Add FastAI datasets

---
 Project.toml                  |   6 ++
 src/FastAI/FastAIDatasets.jl  |  57 ++++++++++++
 src/FastAI/containers.jl      |  49 ++++++++++
 src/FastAI/fastaidatasets.jl  | 126 +++++++++++++++++++++++++
 src/FastAI/load.jl            | 121 ++++++++++++++++++++++++
 src/FastAI/transformations.jl | 168 ++++++++++++++++++++++++++++++++++
 src/MLDatasets.jl             |   1 +
 test/runtests.jl              |   1 +
 test/tst_fastai_datasets.jl   |  48 ++++++++++
 9 files changed, 577 insertions(+)
 create mode 100644 src/FastAI/FastAIDatasets.jl
 create mode 100644 src/FastAI/containers.jl
 create mode 100644 src/FastAI/fastaidatasets.jl
 create mode 100644 src/FastAI/load.jl
 create mode 100644 src/FastAI/transformations.jl
 create mode 100644 test/tst_fastai_datasets.jl

diff --git a/Project.toml b/Project.toml
index c1f885dc..38e48e18 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,11 +5,17 @@ version = "0.5.5"
 [deps]
 BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
+Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
+FileTrees = "72696420-646e-6120-6e77-6f6420746567"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
+LearnBase = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
+MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
diff --git a/src/FastAI/FastAIDatasets.jl b/src/FastAI/FastAIDatasets.jl
new file mode 100644
index 00000000..a58ed41c
--- /dev/null
+++ b/src/FastAI/FastAIDatasets.jl
@@ -0,0 +1,57 @@
+module FastAIDatasets
+
+
+
+using DataDeps
+using FilePathsBase
+using FilePathsBase: filename
+import FileIO
+using FileTrees
+using MLDataPattern
+using MLDataPattern: splitobs
+import LearnBase
+using Colors
+using FixedPointNumbers
+
+include("fastaidatasets.jl")
+
+function __init__()
+    initdatadeps()
+end
+
+include("containers.jl")
+include("transformations.jl")
+include("load.jl")
+
+
+export
+    # reexports from MLDataPattern
+    splitobs,
+    getobs,
+    nobs,
+
+    # container transformations
+    mapobs,
+    filterobs,
+    groupobs,
+    joinobs,
+
+    # primitive containers
+    FileDataset,
+    TableDataset,
+
+    # utilities
+    isimagefile,
+    loadfile,
+    filename,
+
+    # datasets
+    DATASETS,
+    loadtaskdata,
+    datasetpath
+
+
+
+
+
+end # module
diff --git a/src/FastAI/containers.jl b/src/FastAI/containers.jl
new file mode 100644
index 00000000..28a0f752
--- /dev/null
+++ b/src/FastAI/containers.jl
@@ -0,0 +1,49 @@
+
+
+# FileDataset
+
+struct FileDataset
+    tree::FileTree
+    nodes::Vector{FileTrees.File}
+end
+
+function FileDataset(args...; kwargs...)
+    tree = FileTree(args...; kwargs...)
+    return FileDataset(tree, FileTrees.files(tree))
+end
+
+Base.show(io::IO, data::FileDataset) = print(
+    io,
+    "FileDataset(\"", data.tree.name, "\", ", nobs(data), " observations)")
+
+LearnBase.nobs(ds::FileDataset) = length(ds.nodes)
+LearnBase.getobs(ds::FileDataset, idx::Int) = path(ds.nodes[idx])
+
+
+# File utilities
+
+"""
+    loadfile(file)
+
+Load a file from disk into the appropriate format.
+""" +function loadfile(file::String) + if isimagefile(file) + # faster image loading + return FileIO.load(file, view = true) + else + return FileIO.load(file) + end +end + +loadfile(file::AbstractPath) = loadfile(string(file)) +loadfile(file::FileTrees.File) = loadfile(path(file)) + + +isimagefile(file::AbstractPath) = isimagefile(string(file)) +isimagefile(file::File) = isimagefile(file.name) +isimagefile(file::String) = occursin(IMAGEFILE_REGEX, lowercase(file)) +const IMAGEFILE_REGEX = r"\.(gif|jpe?g|tiff?|png|webp|bmp)$" + + +## TODO: TableDataset diff --git a/src/FastAI/fastaidatasets.jl b/src/FastAI/fastaidatasets.jl new file mode 100644 index 00000000..39b61c53 --- /dev/null +++ b/src/FastAI/fastaidatasets.jl @@ -0,0 +1,126 @@ +struct FastAIDataset + name + subfolder + extension + description + checksum + datadepname + size +end + +const ROOT_URL = "https://s3.amazonaws.com/fast-ai-" + +function FastAIDataset( + name, subfolder, checksum = ""; extension = "tgz", description = "", datadepname = name, size = "???") + return FastAIDataset(name, subfolder, extension, description, checksum, datadepname, size) +end + + +const DATASETCONFIGS = [ + # imageclas + FastAIDataset("CUB_200_2011", "imageclas"), + FastAIDataset("bedroom", "imageclas"), + FastAIDataset("caltech_101", "imageclas"), + FastAIDataset("cifar10", "imageclas", "637c5814e11aefcb6ee76d5f59c67ddc8de7f5b5077502a195b0833d1e3e4441"), + FastAIDataset("cifar100", "imageclas", "085ac613ceb0b3659c8072143ae553d5dd146b3c4206c3672a56ed02d0e77d28"), + FastAIDataset("food-101", "imageclas"), + FastAIDataset("imagenette-160", "imageclas"), + FastAIDataset("imagenette-320", "imageclas"), + FastAIDataset("imagenette", "imageclas"), + FastAIDataset("imagenette2-160", "imageclas", "88daccb09b6fce93f45e6c09ddeb269cce705549e6bff322092a2a5a11489863"), + FastAIDataset("imagenette2-320", "imageclas", "569b4497c98db6dd29f335d1f109cf315fe127053cedf69010d047f0188e158c"), + FastAIDataset("imagenette2", "imageclas"), + FastAIDataset("imagewang-160", "imageclas"), + FastAIDataset("imagewang-320", "imageclas"), + FastAIDataset("imagewang", "imageclas"), + FastAIDataset("imagewoof-160", "imageclas"), + FastAIDataset("imagewoof-320", "imageclas"), + FastAIDataset("imagewoof", "imageclas"), + FastAIDataset("imagewoof2-160", "imageclas", "663c22f69c2802d85e2a67103c017e047096702ffddf9149a14011b7002539bf"), + FastAIDataset("imagewoof2-320", "imageclas"), + FastAIDataset("imagewoof2", "imageclas"), + FastAIDataset("mnist_png", "imageclas", "9e18edaa3a08b065d8f80a019ca04329e6d9b3e391363414a9bd1ada30563672"), + FastAIDataset("mnist_var_size_tiny", "imageclas", "8a0f6ca04c2d31810dc08e739c7fa9b612e236383f70dd9fc6e5a62e672e2283"), + FastAIDataset("oxford-102-flowers", "imageclas"), + FastAIDataset("oxford-iiit-pet", "imageclas"), + FastAIDataset("stanford-cars", "imageclas"), + + # nlp + FastAIDataset("ag_news_csv", "nlp"), + FastAIDataset("amazon_review_full_csv", "nlp"), + FastAIDataset("amazon_review_polarity_csv", "nlp"), + FastAIDataset("dbpedia_csv", "nlp"), + FastAIDataset("giga-fren", "nlp"), + FastAIDataset("imdb", "nlp"), + FastAIDataset("sogou_news_csv", "nlp"), + FastAIDataset("wikitext-103", "nlp"), + FastAIDataset("wikitext-2", "nlp"), + FastAIDataset("yahoo_answers_csv", "nlp"), + FastAIDataset("yelp_review_full_csv", "nlp"), + FastAIDataset("yelp_review_polarity_csv", "nlp"), + + # imagelocal + FastAIDataset("biwi_head_pose", "imagelocal"), + FastAIDataset("camvid", "imagelocal"), + FastAIDataset("pascal-voc", "imagelocal"), + 
FastAIDataset("pascal_2007", "imagelocal"), + FastAIDataset("pascal_2012", "imagelocal"), + FastAIDataset("siim_small", "imagelocal"), + FastAIDataset("skin-lesion", "imagelocal"), + FastAIDataset("tcga-small", "imagelocal"), + + # sample + FastAIDataset("adult_sample", "sample"), + FastAIDataset("biwi_sample", "sample"), + FastAIDataset("camvid_tiny", "sample", "cd42a9bdd8ad3e0ce87179749beae05b4beb1ae6ab665841180b1d8022fc230b"), + FastAIDataset("dogscats", "sample"), + FastAIDataset("human_numbers", "sample"), + FastAIDataset("imdb_sample", "sample"), + FastAIDataset("mnist_sample", "sample"), + FastAIDataset("mnist_tiny", "sample"), + FastAIDataset("movie_lens_sample", "sample"), + FastAIDataset("planet_sample", "sample"), + FastAIDataset("planet_tiny", "sample"), + + # coco + FastAIDataset("coco_sample", "coco", "56960c0ac09ff35cd8588823d37e1ed0954cb88b8bfbd214a7763e72f982911c", size = "3GB"), + FastAIDataset("train2017", "coco", datadepname="coco-train2017", extension="zip"), + FastAIDataset("val2017", "coco", datadepname="coco-val2017", extension="zip"), + FastAIDataset("test2017", "coco", datadepname="coco-test2017", extension="zip"), + FastAIDataset("unlabeled2017", "coco", datadepname="coco-unlabeled2017", extension="zip"), + FastAIDataset("image_info_test2017", "coco", datadepname="coco-image_info_test2017", extension="zip"), + FastAIDataset("image_info_unlabeled2017", "coco", datadepname="coco-image_info_unlabeled2017", extension="zip"), + FastAIDataset("annotations_trainval2017", "coco", datadepname="coco-annotations_trainval2017", extension="zip"), + FastAIDataset("stuff_annotations_trainval2017", "coco", datadepname="coco-stuff_annotations_trainval2017", extension="zip"), + FastAIDataset("panoptic_annotations_trainval2017", "coco", datadepname="coco-panoptic_annotations_trainval2017", extension="zip"), +] + +const DATASETS = [d.datadepname for d in DATASETCONFIGS] +const DATASETS_IMAGECLASSIFICATION = vcat( + [d.datadepname for d in DATASETCONFIGS if d.subfolder == "imageclas"], + ["mnist_sample", "mnist_tiny", "dogscats"], + +) + + +function DataDeps.DataDep(d::FastAIDataset) + return DataDep( + "fastai-$(d.datadepname)", + """ + "$(d.name)" from the fastai dataset repository (https://course.fast.ai/datasets) + + $(d.description) + + Download size: $(d.size) + """, + "$(ROOT_URL)$(d.subfolder)/$(d.name).$(d.extension)", + d.checksum, + post_fetch_method = DataDeps.unpack, + ) +end + +function initdatadeps() + for d in DATASETCONFIGS + DataDeps.register(DataDep(d)) + end +end diff --git a/src/FastAI/load.jl b/src/FastAI/load.jl new file mode 100644 index 00000000..c61ae6b6 --- /dev/null +++ b/src/FastAI/load.jl @@ -0,0 +1,121 @@ + +""" + datasetpath(name) + +Return the folder that dataset `name` is stored. + +If it hasn't been downloaded yet, you will be asked if you want to +download it. See [`Datasets.DATASETS`](#) for a list of available datasets. +""" +function datasetpath(name) + datadeppath = @datadep_str "fastai-$name" + return Path(joinpath(datadeppath, name)) +end + + +# Classification datasets + +""" + loaddataclassification(dir; split = false) + +Load a data container for image classification with observations +`(input = image, target = class)`. + +If `split` is `true`, returns a tuple of the data containers split by +the name of the grandparent folder. + +`dir` should contain the data in the following canonical format: + +- dir + - split (e.g. "train", "valid"...) + - class (e.g. "cat", "dog"...) + - image32434.{jpg/png/...} + - ... + - ... + - ... 
+""" +function loaddataclassification( + dir, + split=false, + filterparent= (!=("test")), + kwargs...) + data = filterobs(FileDataset(dir)) do path + isimagefile(path) && filterparent(filename(parent(path))) + end + if split + datas = groupobs(data) do path + filename(parent(parent(path))) + end + return Tuple(mapobs( + (input = loadfile, target = path -> filename(parent(path))), + data + ) for data in datas) + else + return mapobs( + (input = loadfile, target = path -> filename(parent(path))), + data + ) + end +end + + +""" + getclassesclassification(dir::AbstractPath) + getclassesclassification(name::String) + +Get the list of classes for classification dataset saved in `dir`. + +Corresponds to all unique names of parent folders that contain images. + +""" +function getclassesclassification(dir) + data = mapobs(filterobs(isimagefile, FileDataset(dir))) do path + return filename(parent(path)) + end + return unique(collect(eachobsparallel(data, useprimary=true, buffered=false))) +end +getclassesclassification(name::String) = getclassesclassification(datasetpath(name)) + +# Segmentation datasets + +""" + loaddatasegmentation(dir; split = false) + +Load a data container for image segmentation with observations +`(input = image, target = mask)`. + +If `split` is `true`, returns a tuple of the data containers split by +the name of the grandparent folder. +""" +function loaddatasegmentation( + dir; + split=false, + kwargs...) + imagedata = mapobs(loadfile, filterobs(isimagefile, FileDataset(joinpath(dir, "images")))) + maskdata = mapobs(maskfromimage ∘ loadfile, filterobs(isimagefile, FileDataset(joinpath(dir, "labels")))) + return mapobs((input = obs -> obs[1], target = obs -> obs[2]), (imagedata, maskdata)) +end + + +""" + getclassessegmentation(dir::AbstractPath) + getclassessegmentation(name::String) + +Get the list of classes for segmentation dataset saved in `dir`. + +Should be saved as a new-line delimited file called "codes.txt" in `dir`. +""" +function getclassessegmentation(dir::AbstractPath) + classes = readlines(open(joinpath(dir, "codes.txt"))) + return classes +end +getclassessegmentation(name::String) = getclassessegmentation(datasetpath(name)) + + + + +maskfromimage(a::AbstractArray{<:Gray{T}}) where T = maskfromimage(reinterpret(T, a)) +maskfromimage(a::AbstractArray{<:Normed{T}}) where T = maskfromimage(reinterpret(T, a)) +function maskfromimage(a::AbstractArray{I}) where {I<:Integer} + return a .+ one(I) +end diff --git a/src/FastAI/transformations.jl b/src/FastAI/transformations.jl new file mode 100644 index 00000000..54ef3552 --- /dev/null +++ b/src/FastAI/transformations.jl @@ -0,0 +1,168 @@ + +# mapobs + +struct MappedData + f + data +end + +Base.show(io::IO, data::MappedData) = print(io, "mapobs($(data.f), $(data.data))") +LearnBase.nobs(data::MappedData) = nobs(data.data) +LearnBase.getobs(data::MappedData, idx::Int) = data.f(getobs(data.data, idx)) +LearnBase.getobs(data::MappedData, idxs::AbstractVector) = data.f.(getobs(data.data, idxs)) + + +""" + mapobs(f, data) + +Lazily map `f` over the observations in a data container `data`. + +```julia +data = 1:10 +getobs(data, 8) == 8 +mdata = mapobs(-, data) +getobs(mdata, 8) == -8 +``` +""" +mapobs(f, data) = MappedData(f, data) + + +""" + mapobs(fs, data) + +Lazily map each function in tuple `fs` over the observations in data container `data`. +Returns a tuple of transformed data containers. 
+""" +mapobs(fs::Tuple, data) = Tuple(mapobs(f, data) for f in fs) + + +struct NamedTupleData{TData, F} + data::TData + namedfs::NamedTuple{F} +end + +LearnBase.nobs(data::NamedTupleData) = nobs(getfield(data, :data)) + +function LearnBase.getobs(data::NamedTupleData{TData, F}, idx::Int) where {TData, F} + obs = getobs(getfield(data, :data), idx) + namedfs = getfield(data, :namedfs) + return NamedTuple{F}(f(obs) for f in namedfs) +end + +Base.getproperty(data::NamedTupleData, field::Symbol) = mapobs( + getproperty(getfield(data, :namedfs), field), + getfield(data, :data), +) + +Base.show(io::IO, data::NamedTupleData) = print(io, "mapobs($(getfield(data, :namedfs)), $(getfield(data, :data)))") + +""" + mapobs(namedfs::NamedTuple, data) + +Map a `NamedTuple` of functions over `data`, turning it into a data container +of `NamedTuple`s. Field syntax can be used to select a column of the resulting +data container. + +```julia +data = 1:10 +nameddata = mapobs((x = sqrt, y = log), data) +getobs(nameddata, 10) == (x = sqrt(10), y = log(10)) +getobs(nameddata.x, 10) == sqrt(10) +``` +""" +function mapobs(namedfs::NamedTuple, data) + return NamedTupleData(data, namedfs) +end + +# filterobs + +""" + filterobs(f, data) + +Return a subset of data container `data` including all indices `i` for +which `f(getobs(data, i)) === true`. + +```julia +data = 1:10 +nobs(data) == 10 +fdata = filterobs(>(5), data) +nobs(fdata) == 5 +``` +""" +function filterobs(f, data; iterfn=_iterobs) + return datasubset(data, [i for (i, obs) in enumerate(iterfn(data)) if f(obs)]) +end + +_iterobs(data) = [getobs(data, i) for i in 1:nobs(data)] + + +# groupobs + +""" + groupobs(f, data) + +Split data container data `data` into different data containers, grouping +observations by `f(obs)`. + +```julia +data = -10:10 +datas = groupobs(>(0), data) +length(datas) == 2 +``` +""" +function groupobs(f, data) + groups = Dict{Any,Vector{Int}}() + for i in 1:nobs(data) + group = f(getobs(data, i)) + if !haskey(groups, group) + groups[group] = [i] + else + push!(groups[group], i) + end + end + return Tuple(datasubset(data, groups[group]) + for group in sort(collect(keys(groups)))) +end + +# joinobs + +struct JoinedData{T,N} + datas::NTuple{N,T} + ns::NTuple{N,Int} +end + +JoinedData(datas) = JoinedData(datas, nobs.(datas)) + +LearnBase.nobs(data::JoinedData) = sum(data.ns) +function LearnBase.getobs(data::JoinedData, idx) + for (i, n) in enumerate(data.ns) + if idx <= n + return getobs(data.datas[i], idx) + else + idx -= n + end + end +end + +""" + joinobs(datas...) + +Concatenate data containers `datas`. + +```julia +data1, data2 = 1:10, 11:20 +jdata = joinobs(data1, data2) +getobs(jdata, 15) == 15 +``` +""" +joinobs(datas...) = JoinedData(datas) + + +# TODO: NamedTupleData transformation +# +# mdata = mapobs(data, (col1 = f1, col2 = f2)) +# getobs(mdata, 1) == (col1 = f1(getobs(data, 1)), col2 = f2(getobs(data, 1))) +# getobs(mdata.col1, 1) == f1(getobs(data, 1)) +# +# Useful for datasets where you want to split off the targets, e.g. to avoid loading the +# images. 
diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
index e816750b..38eb0abc 100644
--- a/src/MLDatasets.jl
+++ b/src/MLDatasets.jl
@@ -36,6 +36,7 @@ include("SVHN2/SVHN2.jl")
 include("PTBLM/PTBLM.jl")
 include("UD_English/UD_English.jl")
 include("EMNIST/EMNIST.jl")
+include("FastAI/FastAIDatasets.jl")
 
 function __init__()
     # initialize optional dependencies
diff --git a/test/runtests.jl b/test/runtests.jl
index c641bc23..d9f348e3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,7 @@ tests = [
     "tst_fashion_mnist.jl",
     "tst_svhn2.jl",
     "tst_emnist.jl",
+    "tst_fastai_datasets.jl"
 ]
 
 for t in tests
diff --git a/test/tst_fastai_datasets.jl b/test/tst_fastai_datasets.jl
new file mode 100644
index 00000000..c70dfd57
--- /dev/null
+++ b/test/tst_fastai_datasets.jl
@@ -0,0 +1,48 @@
+
+using Test
+using MLDatasets
+using MLDatasets.FastAIDatasets
+using MLDatasets.FastAIDatasets: datasetpath, loaddataclassification,
+    mapobs, filterobs, groupobs, joinobs, getobs, nobs
+
+@testset "datasetpath" begin
+    @test_nowarn FastAIDatasets.datasetpath("mnist_var_size_tiny")
+end
+
+@testset "loaddataset" begin
+    @test_nowarn loaddataclassification(datasetpath("mnist_var_size_tiny"))
+end
+
+
+@testset "Data container transformations" begin
+    @testset "mapobs" begin
+        data = 1:10
+        mdata = mapobs(-, data)
+        @test getobs(mdata, 8) == -8
+
+        mdata2 = mapobs((-, x -> 2x), data)
+        @test getobs(mdata2, 8) == (-8, 16)
+
+        nameddata = mapobs((x = sqrt, y = log), data)
+        @test getobs(nameddata, 10) == (x = sqrt(10), y = log(10))
+        @test getobs(nameddata.x, 10) == sqrt(10)
+    end
+
+    @testset "filterobs" begin
+        data = 1:10
+        fdata = filterobs(>(5), data)
+        @test nobs(fdata) == 5
+    end
+
+    @testset "groupobs" begin
+        data = -10:10
+        datas = groupobs(>(0), data)
+        @test length(datas) == 2
+    end
+
+    @testset "joinobs" begin
+        data1, data2 = 1:10, 11:20
+        jdata = joinobs(data1, data2)
+        @test getobs(jdata, 15) == 15
+    end
+end
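
Not part of the patch: a usage sketch of the API this PR adds, for review purposes. It only uses names defined in the diff above and assumes the `mnist_var_size_tiny` DataDep download prompt is confirmed interactively on first access.

```julia
using MLDatasets
using MLDatasets.FastAIDatasets: datasetpath, loaddataclassification,
    mapobs, getobs, nobs

# Downloads the dataset via DataDeps on first use and returns its folder.
dir = datasetpath("mnist_var_size_tiny")

# Lazy classification container: each observation is a NamedTuple
# `(input = image, target = classname)` derived from the folder layout.
data = loaddataclassification(dir)
image, target = getobs(data, 1)
nobs(data)

# Field syntax selects the target "column" without loading any images.
targets = data.target
getobs(targets, 1) == target
```

Because all container transformations are lazy, image files are only read from disk when `getobs` is called.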