Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port of FastAIDasets #194

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/MLDatasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ export TUDataset
include("datasets/meshes/faust.jl")
export FAUST

include("datasets/fastaidatasets.jl")

function __init__()
# TODO automatically find and execute __init__xxx functions

Expand Down Expand Up @@ -168,6 +170,9 @@ function __init__()

# mesh
__init__faust()

# fastai datasets
__init__fastai_datasets()
end

end #module
219 changes: 219 additions & 0 deletions src/datasets/fastaidatasets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
struct FastAIDataset
name::Any
subfolder::Any
extension::Any
description::Any
checksum::Any
datadepname::Any
subpath::Any
size::Any
end

const ROOT_URL_FastAI = "https://s3.amazonaws.com/fast-ai-"

function FastAIDataset(name, subfolder, checksum = "";
extension = "tgz",
description = "",
datadepname = name,
subpath = name,
size = "???")
return FastAIDataset(name, subfolder, extension, description, checksum, datadepname,
subpath, size)
end


const DESCRIPTIONS = Dict(
"imagenette" => "A subset of 10 easily classified classes from Imagenet: tench, English springer, cassette player, chain saw, church, French horn, garbage truck, gas pump, golf ball, parachute",
"imagewoof" => "A subset of 10 harder to classify classes from Imagenet (all dog breeds): Australian terrier, Border terrier, Samoyed, beagle, Shih-Tzu, English foxhound, Rhodesian ridgeback, dingo, golden retriever, Old English sheepdog",
"food-101" => "101 food categories, with 101,000 images; 250 test images and 750 training images per class. The training images were not cleaned. All images were rescaled to have a maximum side length of 512 pixels.",
)

const DATASETCONFIGS = [
# imageclas
FastAIDataset("CUB_200_2011", "imageclas",
"0c685df5597a8b24909f6a7c9db6d11e008733779a671760afef78feb49bf081",
size = "1GiB"),
FastAIDataset("bedroom", "imageclas",
"7c95250ccb177c582f602c08f239c71f7a70512729d2e078925261cf5e349f5d",
size = "4.25GiB"),
FastAIDataset("caltech_101", "imageclas",
"af6ece2f339791ca20f855943d8b55dd60892c0a25105fcd631ee3d6430f9926",
size = "126MiB", subpath = "101_ObjectCategories"),
FastAIDataset("cifar10", "imageclas",
"637c5814e11aefcb6ee76d5f59c67ddc8de7f5b5077502a195b0833d1e3e4441"),
FastAIDataset("cifar100", "imageclas",
"085ac613ceb0b3659c8072143ae553d5dd146b3c4206c3672a56ed02d0e77d28"),
FastAIDataset("food-101", "imageclas",
"abc3d6b03a9886fdea6d2a124cf88e22a99dfdb03085b2478be97de3f8e4679f",
size = "5.3GB", description = DESCRIPTIONS["food-101"]),
FastAIDataset("imagenette-160", "imageclas",
"1bd650bc16884ca88e4f0f537ed8569b1f8d7ae865d37eba8ecdd87d9cd9dcfa",
size = "1.45GiB", description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagenette-320", "imageclas", description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagenette", "imageclas", description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagenette2-160", "imageclas",
"64d0c4859f35a461889e0147755a999a48b49bf38a7e0f9bd27003f10db02fe5",
description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagenette2-320", "imageclas",
"569b4497c98db6dd29f335d1f109cf315fe127053cedf69010d047f0188e158c",
description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagenette2", "imageclas",
"6cbfac238434d89fe99e651496f0812ebc7a10fa62bd42d6874042bf01de4efd",
description = DESCRIPTIONS["imagenette"]),
FastAIDataset("imagewang-160", "imageclas",
"a0d360f9d8159055b3bf2b8926a51d19b2f1ff98a1eef6034e4b891c59ca3f1a",
size = "182MiB"),
FastAIDataset("imagewang-320", "imageclas",
"fd53301c335aa46f0f4add68dd471cd0b8b66412382cc36f5f510d0a03fb4d9d",
size = "639MiB"),
FastAIDataset("imagewang", "imageclas"),
FastAIDataset("imagewoof-160", "imageclas",
"a0d360f9d8159055b3bf2b8926a51d19b2f1ff98a1eef6034e4b891c59ca3f1a",
description = DESCRIPTIONS["imagewoof"]),
FastAIDataset("imagewoof-320", "imageclas", description = DESCRIPTIONS["imagewoof"]),
FastAIDataset("imagewoof", "imageclas", description = DESCRIPTIONS["imagewoof"]),
FastAIDataset("imagewoof2-160", "imageclas",
"b5ffa16037e07f60882434f55b7814a3d44483f2a484129f251604bc0d0f8172",
description = DESCRIPTIONS["imagewoof"]),
FastAIDataset("imagewoof2-320", "imageclas",
"7db6120fdb9ae079e26346f89e7b00d7f184f8137791609b97fd0405d3f92305",
description = DESCRIPTIONS["imagewoof"], size = "313MB"),
FastAIDataset("imagewoof2", "imageclas",
"de3f58c4ea3e042cf3f8365fbc699288cfe1d8c151059040d181c221bd5a55b8",
description = DESCRIPTIONS["imagewoof"], size = "1.25GiB"),
FastAIDataset("mnist_png", "imageclas",
"9e18edaa3a08b065d8f80a019ca04329e6d9b3e391363414a9bd1ada30563672"),
FastAIDataset("mnist_var_size_tiny", "imageclas",
"8a0f6ca04c2d31810dc08e739c7fa9b612e236383f70dd9fc6e5a62e672e2283"),
FastAIDataset("oxford-102-flowers", "imageclas"),
FastAIDataset("oxford-iiit-pet", "imageclas"),
FastAIDataset("stanford-cars", "imageclas"),

# nlp
FastAIDataset("ag_news_csv", "nlp",
"9a8c300eabb45750237fcc669f61cb8a3448f3ef6f6098e1ce340e444f6872be",
size = "11MB"),
FastAIDataset("amazon_review_full_csv", "nlp",
"4af62eeee139d0142e0747340b68646d23483d9475c33ea0641ee9175b423443",
size = "600MB"),
FastAIDataset("amazon_review_polarity_csv", "nlp",
"d2a3ee7a214497a5d1b8eaed7c8d7ba2737de00ada3b0ec46243983efa100361",
size = "600MB"),
FastAIDataset("dbpedia_csv", "nlp",
"42db5221ddedddb673a4cabcc5f3a7d869714c878bcfe4ba94b29d14aa38e417",
size = "65MB"),
FastAIDataset("giga-fren", "nlp",
"11c97af99471fe641f210d8b86ccccf3b298b9199853987ee53892d709d7ca6b",
size = "2.4GB"),
FastAIDataset("imdb", "nlp",
"d501018afa17aee9fa1ebe8ac29859a5609980e13dc6e611aa21567cc357351f",
size = "140MB"),
FastAIDataset("sogou_news_csv", "nlp",
"6b77fc935561d339b82aa552d7e31ea59eff492a494920579b3ce70604efb5c2",
size = "360MB"),
FastAIDataset("wikitext-103", "nlp",
"27b89e94d98a9f9db74588a2e75b04378ee21569ce55d329d3e73e27d0952551",
size = "181MB"),
FastAIDataset("wikitext-2", "nlp",
"4e39df0e84453ae2f3d34333de2a9d8e57560a7a6e621f13e11dc21241320074",
size = "4MB"),
FastAIDataset("yahoo_answers_csv", "nlp",
"2d4277855faf8b35259009425fa8f7fe1888b5644b47165508942d000f4c96ae",
size = "305MB"),
FastAIDataset("yelp_review_full_csv", "nlp",
"56006b0a17a370f1e366504b1f2c3e3754e4a3dda17d3e718a885c552869a559",
size = "187MB"),
FastAIDataset("yelp_review_polarity_csv", "nlp",
"528f22e286cad085948acbc3bea7e58188416546b0e364d0ae4ca0ce666abe35",
size = "158MB"),

# imagelocal
FastAIDataset("biwi_head_pose", "imagelocal",
"9cfefd53ed85f824c5908bc6eb21fc719583eec57a7df1d8141d3156645693cf",
size = "430MiB"),
FastAIDataset("camvid", "imagelocal",
"11db05fc3ee727fb17de7499380b20258a41beeb1002a2aee2c2244a472a4a45",
size = "571MB"),
FastAIDataset("pascal-voc", "imagelocal",
"10fc13a659da20fdd8302dd394d88ca7e4e60e69fd8a5212c3e3357964a58215",
size = "4.3GB"),
FastAIDataset("pascal_2007", "imagelocal"),
FastAIDataset("pascal_2012", "imagelocal"),
FastAIDataset("siim_small", "imagelocal"),
FastAIDataset("skin-lesion", "imagelocal"),
FastAIDataset("tcga-small", "imagelocal"),

# sample
FastAIDataset("adult_sample", "sample",
"47ecd1848abc976643ee82d8788b712e3006d629bbc7554efa1077a91579e99e",
size = "3.8MB"),
FastAIDataset("biwi_sample", "sample"),
FastAIDataset("camvid_tiny", "sample",
"cd42a9bdd8ad3e0ce87179749beae05b4beb1ae6ab665841180b1d8022fc230b"),
FastAIDataset("dogscats", "sample",
"b79c0a5e4aa9ba7a0b83abbf61908c61e15bed0e5b236e86a0c4a080c8f70d7c",
size = "800MiB"),
FastAIDataset("human_numbers", "sample"),
FastAIDataset("imdb_sample", "sample",
"8e776d995296136b3f9a3cf001796d886cb0b60e86877ce71c7abbdc3c247341",
size = "4KB"),
FastAIDataset("mnist_sample", "sample",
"b373a14f282298aeba0f7dd56b7cdb6c2401063d4f118c39c54982907760bd38",
size = "3MB"),
FastAIDataset("mnist_tiny", "sample",
"0d1fedf86243931aa3fc065d2cf4ffab339a972958d8594ae993ee32bd8e15b9",
size = "300KB"),
FastAIDataset("movie_lens_sample", "sample"),
FastAIDataset("planet_sample", "sample",
"f2509212bb2dcdc147423b164564f2e63cae1d1db0b504166e5b92cfbcbb3b4c",
size = "14.8MB"),
FastAIDataset("planet_tiny", "sample",
"41a5fdd82db1c9fb2cff17e1a1270102414a25a34b21b770f953d28483961edb",
size = "1MB"),

# coco
FastAIDataset("coco_sample", "coco", "56960c0ac09ff35cd8588823d37e1ed0954cb88b8bfbd214a7763e72f982911c", size="3GB"),
FastAIDataset("train2017", "coco", datadepname="coco-train2017", extension="zip"),
FastAIDataset("val2017", "coco", datadepname="coco-val2017", extension="zip"),
FastAIDataset("test2017", "coco", datadepname="coco-test2017", extension="zip"),
FastAIDataset("unlabeled2017", "coco", datadepname="coco-unlabeled2017", extension="zip"),
FastAIDataset("image_info_test2017", "coco", datadepname="coco-image_info_test2017", extension="zip"),
FastAIDataset("image_info_unlabeled2017", "coco", datadepname="coco-image_info_unlabeled2017", extension="zip"),
FastAIDataset("annotations_trainval2017", "coco", datadepname="coco-annotations_trainval2017", extension="zip"),
FastAIDataset("stuff_annotations_trainval2017", "coco", datadepname="coco-stuff_annotations_trainval2017", extension="zip"),
FastAIDataset("panoptic_annotations_trainval2017", "coco", datadepname="coco-panoptic_annotations_trainval2017", extension="zip"),
]


"""
fastaidatasets()

List all available FastAI datasets.
"""
fastaidatasets() = [d.datadepname for d in DATASETCONFIGS]


function DataDeps.DataDep(d::FastAIDataset)
return DataDep("fastai-$(d.datadepname)",
"""
"$(d.name)" from the fastai dataset repository (https://course.fast.ai/datasets)
$(d.description)
Download size: $(d.size)
""",
"$(ROOT_URL_FastAI)$(d.subfolder)/$(d.name).$(d.extension)",
d.checksum,
post_fetch_method = f -> begin
DataDeps.unpack(f)
extracted = readdir(pwd())[1]
temp = mktempdir()
mv(extracted, temp, force=true)
mv(temp, pwd(), force=true)
end,
)
end

function __init__fastai_datasets()
for d in DATASETCONFIGS
DataDeps.register(DataDep(d))
end
end