JuliaML · lorenzoh · Feb 25, 2021
diff --git a/Project.toml b/Project.toml
@@ -5,11 +5,17 @@ version = "0.5.5"
 [deps]
 BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
+Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
+FileTrees = "72696420-646e-6120-6e77-6f6420746567"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
+LearnBase = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
+MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]

diff --git a/src/FastAI/FastAIDatasets.jl b/src/FastAI/FastAIDatasets.jl
@@ -0,0 +1,57 @@
+# Data containers, file utilities and DataDeps registrations for the datasets
+# listed in fastaidatasets.jl.
+module FastAIDatasets
+
+
+
+using DataDeps
+using FilePathsBase
+using FilePathsBase: filename
+import FileIO
+using FileTrees
+using MLDataPattern
+using MLDataPattern: splitobs
+import LearnBase
+using Colors
+using FixedPointNumbers
+
+# Included before `__init__` is defined; it provides `initdatadeps`.
+include("fastaidatasets.jl")
+
+# Register the dataset `DataDep`s each time the module is loaded.
+function __init__()
+    initdatadeps()
+end
+
+include("containers.jl")
+include("transformations.jl")
+include("load.jl")
+
+
+export
+    # reexports from MLDataPattern
+    splitobs,
+    getobs,
+    nobs,
+
+    # container transformations
+    mapobs,
+    filterobs,
+    groupobs,
+    joinobs,
+
+    # primitive containers
+    FileDataset,
+    # NOTE(review): containers.jl only carries a TODO for `TableDataset` —
+    # confirm it is defined somewhere before relying on this export.
+    TableDataset,
+
+    # utilities
+    isimagefile,
+    loadfile,
+    filename,
+
+    # datasets
+    DATASETS,
+    # NOTE(review): `loadtaskdata` is not defined in load.jl — presumably it
+    # lives in another file or is still to be written; verify.
+    loadtaskdata,
+    datasetpath
+
+
+
+
+
+end  # module
diff --git a/src/FastAI/containers.jl b/src/FastAI/containers.jl
@@ -0,0 +1,49 @@
+
+
+# FileDataset
+
+"""
+    FileDataset(tree, nodes)
+    FileDataset(args...; kwargs...)
+
+Data container backed by a `FileTree`: `tree` is the tree itself and `nodes`
+holds its file nodes.
+
+The second form forwards every argument to `FileTree` and collects the
+resulting tree's files with `FileTrees.files`.
+"""
+struct FileDataset
+    tree::FileTree
+    nodes::Vector{FileTrees.File}
+end
+
+function FileDataset(args...; kwargs...)
+    filetree = FileTree(args...; kwargs...)
+    filenodes = FileTrees.files(filetree)
+    return FileDataset(filetree, filenodes)
+end
+
+# Compact display: the name of the underlying tree and the observation count.
+function Base.show(io::IO, data::FileDataset)
+    treename = data.tree.name
+    n = nobs(data)
+    print(io, "FileDataset(\"", treename, "\", ", n, " observations)")
+    return nothing
+end
+
+# Data container interface: there is one observation per file node.
+function LearnBase.nobs(ds::FileDataset)
+    return length(ds.nodes)
+end
+
+# An observation is the path of the `idx`-th file node.
+function LearnBase.getobs(ds::FileDataset, idx::Int)
+    node = ds.nodes[idx]
+    return path(node)
+end
+
+
+# File utilities
+
+"""
+    loadfile(file)
+
+Load `file` from disk with `FileIO.load`.
+
+`file` may be a `String`, an `AbstractPath` or a `FileTrees.File`; the latter
+two are converted and forwarded to the `String` method. Files recognised by
+[`isimagefile`](#) are loaded with `view = true`.
+"""
+function loadfile(file::String)
+    # faster image loading
+    isimagefile(file) && return FileIO.load(file, view = true)
+    return FileIO.load(file)
+end
+
+function loadfile(file::AbstractPath)
+    return loadfile(string(file))
+end
+
+function loadfile(file::FileTrees.File)
+    return loadfile(path(file))
+end
+
+
+# Image file extensions; matched against the lowercased file name.
+const IMAGEFILE_REGEX = r"\.(gif|jpe?g|tiff?|png|webp|bmp)$"
+
+"""
+    isimagefile(file)
+
+Return `true` if the name of `file` ends in one of the extensions in
+`IMAGEFILE_REGEX`, ignoring case. Accepts a `String`, an `AbstractPath` or a
+`File` node.
+"""
+function isimagefile(file::String)
+    return occursin(IMAGEFILE_REGEX, lowercase(file))
+end
+isimagefile(file::File) = isimagefile(file.name)
+isimagefile(file::AbstractPath) = isimagefile(string(file))
+
+
+## TODO: TableDataset
diff --git a/src/FastAI/fastaidatasets.jl b/src/FastAI/fastaidatasets.jl
@@ -0,0 +1,126 @@
+"""
+    FastAIDataset(name, subfolder, checksum = ""; kwargs...)
+
+Configuration of one downloadable dataset archive.
+
+The archive URL is built from `subfolder`, `name` and `extension`; the
+`DataDep` is registered under `datadepname`. `description` and `size` are
+only used in the download message.
+
+# Keywords
+- `extension = "tgz"`
+- `description = ""`
+- `datadepname = name`
+- `size = "???"`
+"""
+struct FastAIDataset
+    name
+    subfolder
+    extension
+    description
+    checksum
+    datadepname
+    size
+end
+
+# Common prefix of all archive URLs; the subfolder is appended directly.
+const ROOT_URL = "https://s3.amazonaws.com/fast-ai-"
+
+function FastAIDataset(
+        name,
+        subfolder,
+        checksum = "";
+        extension = "tgz",
+        description = "",
+        datadepname = name,
+        size = "???")
+    # The struct's field order differs from the argument order used here.
+    return FastAIDataset(
+        name, subfolder, extension, description, checksum, datadepname, size)
+end
+
+
+# All dataset archives that get registered as `DataDep`s. The group comments
+# below repeat the `subfolder` argument of the entries that follow. Entries
+# without a third argument use the constructor's default checksum `""`.
+const DATASETCONFIGS = [
+    # imageclas
+    FastAIDataset("CUB_200_2011", "imageclas"),
+    FastAIDataset("bedroom", "imageclas"),
+    FastAIDataset("caltech_101", "imageclas"),
+    FastAIDataset("cifar10", "imageclas", "637c5814e11aefcb6ee76d5f59c67ddc8de7f5b5077502a195b0833d1e3e4441"),
+    FastAIDataset("cifar100", "imageclas", "085ac613ceb0b3659c8072143ae553d5dd146b3c4206c3672a56ed02d0e77d28"),
+    FastAIDataset("food-101", "imageclas"),
+    FastAIDataset("imagenette-160", "imageclas"),
+    FastAIDataset("imagenette-320", "imageclas"),
+    FastAIDataset("imagenette", "imageclas"),
+    FastAIDataset("imagenette2-160", "imageclas", "88daccb09b6fce93f45e6c09ddeb269cce705549e6bff322092a2a5a11489863"),
+    FastAIDataset("imagenette2-320", "imageclas", "569b4497c98db6dd29f335d1f109cf315fe127053cedf69010d047f0188e158c"),
+    FastAIDataset("imagenette2", "imageclas"),
+    FastAIDataset("imagewang-160", "imageclas"),
+    FastAIDataset("imagewang-320", "imageclas"),
+    FastAIDataset("imagewang", "imageclas"),
+    FastAIDataset("imagewoof-160", "imageclas"),
+    FastAIDataset("imagewoof-320", "imageclas"),
+    FastAIDataset("imagewoof", "imageclas"),
+    FastAIDataset("imagewoof2-160", "imageclas", "663c22f69c2802d85e2a67103c017e047096702ffddf9149a14011b7002539bf"),
+    FastAIDataset("imagewoof2-320", "imageclas"),
+    FastAIDataset("imagewoof2", "imageclas"),
+    FastAIDataset("mnist_png", "imageclas", "9e18edaa3a08b065d8f80a019ca04329e6d9b3e391363414a9bd1ada30563672"),
+    FastAIDataset("mnist_var_size_tiny", "imageclas", "8a0f6ca04c2d31810dc08e739c7fa9b612e236383f70dd9fc6e5a62e672e2283"),
+    FastAIDataset("oxford-102-flowers", "imageclas"),
+    FastAIDataset("oxford-iiit-pet", "imageclas"),
+    FastAIDataset("stanford-cars", "imageclas"),
+
+    # nlp
+    FastAIDataset("ag_news_csv", "nlp"),
+    FastAIDataset("amazon_review_full_csv", "nlp"),
+    FastAIDataset("amazon_review_polarity_csv", "nlp"),
+    FastAIDataset("dbpedia_csv", "nlp"),
+    FastAIDataset("giga-fren", "nlp"),
+    FastAIDataset("imdb", "nlp"),
+    FastAIDataset("sogou_news_csv", "nlp"),
+    FastAIDataset("wikitext-103", "nlp"),
+    FastAIDataset("wikitext-2", "nlp"),
+    FastAIDataset("yahoo_answers_csv", "nlp"),
+    FastAIDataset("yelp_review_full_csv", "nlp"),
+    FastAIDataset("yelp_review_polarity_csv", "nlp"),
+
+    # imagelocal
+    FastAIDataset("biwi_head_pose", "imagelocal"),
+    FastAIDataset("camvid", "imagelocal"),
+    FastAIDataset("pascal-voc", "imagelocal"),
+    FastAIDataset("pascal_2007", "imagelocal"),
+    FastAIDataset("pascal_2012", "imagelocal"),
+    FastAIDataset("siim_small", "imagelocal"),
+    FastAIDataset("skin-lesion", "imagelocal"),
+    FastAIDataset("tcga-small", "imagelocal"),
+
+    # sample
+    FastAIDataset("adult_sample", "sample"),
+    FastAIDataset("biwi_sample", "sample"),
+    FastAIDataset("camvid_tiny", "sample", "cd42a9bdd8ad3e0ce87179749beae05b4beb1ae6ab665841180b1d8022fc230b"),
+    FastAIDataset("dogscats", "sample"),
+    FastAIDataset("human_numbers", "sample"),
+    FastAIDataset("imdb_sample", "sample"),
+    FastAIDataset("mnist_sample", "sample"),
+    FastAIDataset("mnist_tiny", "sample"),
+    FastAIDataset("movie_lens_sample", "sample"),
+    FastAIDataset("planet_sample", "sample"),
+    FastAIDataset("planet_tiny", "sample"),
+
+    # coco
+    # These entries set `datadepname` to a "coco-"-prefixed name that differs
+    # from the archive `name`.
+    FastAIDataset("coco_sample", "coco", "56960c0ac09ff35cd8588823d37e1ed0954cb88b8bfbd214a7763e72f982911c", size = "3GB"),
+    FastAIDataset("train2017", "coco", datadepname="coco-train2017", extension="zip"),
+    FastAIDataset("val2017", "coco", datadepname="coco-val2017", extension="zip"),
+    FastAIDataset("test2017", "coco", datadepname="coco-test2017", extension="zip"),
+    FastAIDataset("unlabeled2017", "coco", datadepname="coco-unlabeled2017", extension="zip"),
+    FastAIDataset("image_info_test2017", "coco", datadepname="coco-image_info_test2017", extension="zip"),
+    FastAIDataset("image_info_unlabeled2017", "coco", datadepname="coco-image_info_unlabeled2017", extension="zip"),
+    FastAIDataset("annotations_trainval2017", "coco", datadepname="coco-annotations_trainval2017", extension="zip"),
+    FastAIDataset("stuff_annotations_trainval2017", "coco", datadepname="coco-stuff_annotations_trainval2017", extension="zip"),
+    FastAIDataset("panoptic_annotations_trainval2017", "coco", datadepname="coco-panoptic_annotations_trainval2017", extension="zip"),
+]
+
+# Names under which the datasets are registered (without the "fastai-" prefix).
+const DATASETS = map(config -> config.datadepname, DATASETCONFIGS)
+
+# Datasets in the "imageclas" subfolder, plus three sample datasets.
+const DATASETS_IMAGECLASSIFICATION = vcat(
+    map(
+        config -> config.datadepname,
+        filter(config -> config.subfolder == "imageclas", DATASETCONFIGS),
+    ),
+    ["mnist_sample", "mnist_tiny", "dogscats"],
+)
+
+
+"""
+    DataDep(d::FastAIDataset)
+
+Build the `DataDep` for dataset configuration `d`.
+
+The dependency is named `"fastai-\$(d.datadepname)"`, downloads
+`ROOT_URL * d.subfolder * "/" * d.name * "." * d.extension` and unpacks the
+archive after fetching.
+"""
+function DataDeps.DataDep(d::FastAIDataset)
+    # An empty checksum means "not recorded yet". Handing "" to DataDeps would
+    # make it an expected hash that no download can match, so pass `nothing`
+    # (no checksum provided) instead.
+    checksum = d.checksum == "" ? nothing : d.checksum
+    return DataDep(
+        "fastai-$(d.datadepname)",
+        """
+        "$(d.name)" from the fastai dataset repository (https://course.fast.ai/datasets)
+
+        $(d.description)
+
+        Download size: $(d.size)
+        """,
+        "$(ROOT_URL)$(d.subfolder)/$(d.name).$(d.extension)",
+        checksum,
+        post_fetch_method = DataDeps.unpack,
+    )
+end
+
+# Register the `DataDep` of every entry in `DATASETCONFIGS` with DataDeps.
+function initdatadeps()
+    foreach(DATASETCONFIGS) do config
+        DataDeps.register(DataDep(config))
+    end
+    return nothing
+end
diff --git a/src/FastAI/load.jl b/src/FastAI/load.jl
@@ -0,0 +1,121 @@
+
+"""
+    datasetpath(name)
+
+Return the folder in which dataset `name` is stored.
+
+The path is resolved through the data dependency `"fastai-\$name"`; if the
+dataset hasn't been downloaded yet, you will be asked whether you want to
+download it. See [`Datasets.DATASETS`](#) for a list of available datasets.
+"""
+function datasetpath(name)
+    # `@datadep_str` (rather than the `datadep"..."` literal) so that `name`
+    # is interpolated.
+    root = @datadep_str "fastai-$name"
+    return Path(joinpath(root, name))
+end
+
+
+# Classification datasets
+
+"""
+    loaddataclassification(dir; split = false, filterparent = !=("test"))
+    loaddataclassification(dir, split, filterparent)
+
+Load a data container for image classification with observations
+`(input = image, target = class)`.
+
+Only image files (see [`isimagefile`](#)) are kept, and only those for which
+`filterparent` returns `true` when called with the name of the file's parent
+folder. The parent folder name is also used as the class.
+
+If `split` is `true`, returns a tuple of data containers, grouped by the
+name of the grandparent folder.
+
+`split` and `filterparent` can be given as keywords or, for backward
+compatibility, positionally; a keyword takes precedence. Additional
+arguments are accepted and ignored.
+
+`dir` should contain the data in the following canonical format:
+
+- dir
+    - split (e.g. "train", "valid"...)
+        - class (e.g. "cat", "dog"...)
+            - image32434.{jpg/png/...}
+            - ...
+        - ...
+    - ...
+"""
+function loaddataclassification(
+        dir,
+        split_positional = false,
+        filterparent_positional = (!=("test")),
+        extra...;
+        split = split_positional,
+        filterparent = filterparent_positional,
+        kwargs...)
+    data = filterobs(FileDataset(dir)) do path
+        isimagefile(path) && filterparent(filename(parent(path)))
+    end
+    # Maps a file path to `(input = loaded file, target = parent folder name)`.
+    toobs = (input = loadfile, target = path -> filename(parent(path)))
+    if split
+        groups = groupobs(data) do path
+            filename(parent(parent(path)))
+        end
+        return Tuple(mapobs(toobs, group) for group in groups)
+    else
+        return mapobs(toobs, data)
+    end
+end
+
+
+"""
+    getclassesclassification(dir::AbstractPath)
+    getclassesclassification(name::String)
+
+Get the list of classes for the classification dataset saved in `dir`, or
+for the dataset called `name` (resolved with [`datasetpath`](#)).
+
+The classes are the unique names of the parent folders of all image files,
+in order of first occurrence.
+"""
+function getclassesclassification(dir)
+    data = mapobs(filterobs(isimagefile, FileDataset(dir))) do path
+        return filename(parent(path))
+    end
+    # Each observation is only a folder name, so a plain sequential pass over
+    # the container is enough; it needs nothing beyond `getobs`/`nobs`.
+    return unique(getobs(data, i) for i in 1:nobs(data))
+end
+getclassesclassification(name::String) = getclassesclassification(datasetpath(name))
+
+# Segmentation datasets
+
+"""
+    loaddatasegmentation(dir; split = false)
+
+Load a data container for image segmentation with observations
+`(input = image, target = mask)`.
+
+Images are read from `dir/images` and masks from `dir/labels`; masks are
+passed through `maskfromimage` after loading. The two folders are paired by
+position, so their image files must be listed in matching order.
+
+`split` and any further keyword arguments are accepted but currently not
+used by the implementation.
+"""
+function loaddatasegmentation(
+        dir;
+        split=false,
+        kwargs...)
+    imagefiles = filterobs(isimagefile, FileDataset(joinpath(dir, "images")))
+    images = mapobs(loadfile, imagefiles)
+
+    maskfiles = filterobs(isimagefile, FileDataset(joinpath(dir, "labels")))
+    masks = mapobs(maskfromimage ∘ loadfile, maskfiles)
+
+    pick = (input = obs -> obs[1], target = obs -> obs[2])
+    return mapobs(pick, (images, masks))
+end
+
+
+"""
+    getclassessegmentation(dir::AbstractPath)
+    getclassessegmentation(name::String)
+
+Get the list of classes for the segmentation dataset saved in `dir`, or for
+the dataset called `name` (resolved with [`datasetpath`](#)).
+
+The classes are read from a newline-delimited file called "codes.txt" in
+`dir`, one class per line.
+"""
+function getclassessegmentation(dir::AbstractPath)
+    # `open(f, path)` closes the file once the lines have been read.
+    return open(readlines, joinpath(dir, "codes.txt"))
+end
+getclassessegmentation(name::String) = getclassessegmentation(datasetpath(name))
+
+
+
+
+"""
+    maskfromimage(a)
+
+Convert a loaded label image into an integer mask.
+
+`Gray` and `Normed` element types are reinterpreted as their parameter type
+until the elements are integers; one is then added to every element, in the
+same integer type.
+"""
+maskfromimage(a::AbstractArray{<:Gray{T}}) where {T} = maskfromimage(reinterpret(T, a))
+maskfromimage(a::AbstractArray{<:Normed{T}}) where {T} = maskfromimage(reinterpret(T, a))
+maskfromimage(a::AbstractArray{I}) where {I<:Integer} = a .+ one(I)