Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first attempt at artifacts #60

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Artifacts.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[MNIST]
git-tree-sha1 = "bca582d83e460d262193b91a8d4eba481ce2d2f1"
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"

[compat]
Expand All @@ -18,7 +20,6 @@ ColorTypes = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
DataDeps = "0.3, 0.4, 0.5, 0.6, 0.7"
FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
GZip = "0.5"
ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
MAT = "0.7, 0.8, 0.9, 0.10"
Requires = "1"
julia = "1"
Expand Down
63 changes: 20 additions & 43 deletions src/MNIST/MNIST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ the 10 possible digits (0-9).
- [`MNIST.convert2image`](@ref)
"""
module MNIST
using DataDeps
using Pkg.Artifacts
using LazyArtifacts
using ColorTypes
using FixedPointNumbers
using ..MLDatasets: bytes_to_type, datafile, download_dep, download_docstring,
_colorview
using ..MLDatasets: bytes_to_type, _colorview

export

Expand All @@ -46,54 +46,31 @@ module MNIST

@deprecate convert2features reshape

const DEPNAME = "MNIST"
const ARTIFACT_NAME = "MNIST"
const TRAINIMAGES = "train-images-idx3-ubyte.gz"
const TRAINLABELS = "train-labels-idx1-ubyte.gz"
const TESTIMAGES = "t10k-images-idx3-ubyte.gz"
const TESTLABELS = "t10k-labels-idx1-ubyte.gz"

"""
download([dir]; [i_accept_the_terms_of_use])

Trigger the (interactive) download of the full dataset into
"`dir`". If no `dir` is provided the dataset will be
downloaded into "~/.julia/datadeps/$DEPNAME".

This function will display an interactive dialog unless
either the keyword parameter `i_accept_the_terms_of_use` or
the environment variable `DATADEPS_ALWAYS_ACCEPT` is set to
`true`. Note that using the data responsibly and respecting
copyright/terms-of-use remains your responsibility.
"""
download(args...; kw...) = download_dep(DEPNAME, args...; kw...)


include(joinpath("Reader","Reader.jl"))
include("interface.jl")
include("utils.jl")

function __init__()
register(DataDep(
DEPNAME,
"""
Dataset: THE MNIST DATABASE of handwritten digits
Authors: Yann LeCun, Corinna Cortes, Christopher J.C. Burges
Website: http://yann.lecun.com/exdb/mnist/

[LeCun et al., 1998a]
Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner.
"Gradient-based learning applied to document recognition."
Proceedings of the IEEE, 86(11):2278-2324, November 1998

The files are available for download at the offical
website linked above. Note that using the data
responsibly and respecting copyright remains your
responsibility. The authors of MNIST aren't really
explicit about any terms of use, so please read the
website to make sure you want to download the
dataset.
""",
"https://ossci-datasets.s3.amazonaws.com/mnist/" .* [TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS],
"0bb1d5775d852fc5bb32c76ca15a7eb4e9a3b1514a2493f7edfcf49b639d7975",
))

# Look up the MNIST artifact recorded in the package-level Artifacts.toml and,
# if it is unknown or not installed locally, build it by downloading the four
# raw dataset files into a freshly created artifact directory.
artifact_toml = joinpath(@__DIR__, "..", "..", "Artifacts.toml")
_hash = artifact_hash(ARTIFACT_NAME, artifact_toml)

if _hash === nothing || !artifact_exists(_hash)
    _hash = create_artifact() do artifact_dir
        # The base URL already ends in "/", so the file name is appended
        # directly; inserting another "/" would request ".../mnist//<file>".
        url_base = "https://ossci-datasets.s3.amazonaws.com/mnist/"
        for file in (TRAINIMAGES, TRAINLABELS, TESTIMAGES, TESTLABELS)
            download(url_base * file, joinpath(artifact_dir, file))
        end
    end
    # `force = true` allows the binding to be rewritten when Artifacts.toml
    # already contains an entry for this name (the recorded-but-not-installed
    # case); without it `bind_artifact!` refuses to overwrite the mapping.
    bind_artifact!(artifact_toml, ARTIFACT_NAME, _hash; lazy = true, force = true)
end


end
74 changes: 30 additions & 44 deletions src/MNIST/interface.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
traintensor([T = N0f8], [indices]; [dir]) -> Array{T}
traintensor([T = N0f8], [indices]) -> Array{T}

Returns the MNIST **training** images corresponding to the given
`indices` as a multi-dimensional array of eltype `T`.
Expand Down Expand Up @@ -42,21 +42,19 @@ julia> MNIST.convert2image(MNIST.traintensor(1)) # convert to column-major color
28×28 Array{Gray{N0f8},2}:
[...]
```

$(download_docstring("MNIST", DEPNAME))
"""
function traintensor(::Type{T}, args...; dir = nothing) where T
path = datafile(DEPNAME, TRAINIMAGES, dir)
function traintensor(::Type{T}, args...) where T
path = joinpath(artifact"MNIST", TRAINIMAGES)
images = Reader.readimages(path, args...)
bytes_to_type(T, images)
end

function traintensor(args...; dir = nothing)
traintensor(N0f8, args...; dir = dir)
function traintensor(args...)
traintensor(N0f8, args...)
end

"""
testtensor([T = N0f8], [indices]; [dir]) -> Array{T}
testtensor([T = N0f8], [indices]) -> Array{T}

Returns the MNIST **test** images corresponding to the given
`indices` as a multi-dimensional array of eltype `T`.
Expand Down Expand Up @@ -99,21 +97,19 @@ julia> MNIST.convert2image(MNIST.testtensor(1)) # convert to column-major colora
28×28 Array{Gray{N0f8},2}:
[...]
```

$(download_docstring("MNIST", DEPNAME))
"""
function testtensor(::Type{T}, args...; dir = nothing) where T
path = datafile(DEPNAME, TESTIMAGES, dir)
function testtensor(::Type{T}, args...) where T
path = joinpath(artifact"MNIST", TESTIMAGES)
images = Reader.readimages(path, args...)
bytes_to_type(T, images)
end

function testtensor(args...; dir = nothing)
testtensor(N0f8, args...; dir = dir)
function testtensor(args...)
testtensor(N0f8, args...)
end

"""
trainlabels([indices]; [dir])
trainlabels([indices])

Returns the MNIST **trainset** labels corresponding to the given
`indices` as an `Int` or `Vector{Int}`. The values of the labels
Expand All @@ -138,21 +134,19 @@ julia> MNIST.trainlabels(1:3) # first three labels
julia> MNIST.trainlabels(1) # first label
5
```

$(download_docstring("MNIST", DEPNAME))
"""
function trainlabels(args...; dir = nothing)
path = datafile(DEPNAME, TRAINLABELS, dir)
function trainlabels(args...)
path = joinpath(artifact"MNIST", TRAINLABELS)
Vector{Int}(Reader.readlabels(path, args...))
end

function trainlabels(index::Integer; dir = nothing)
path = datafile(DEPNAME, TRAINLABELS, dir)
function trainlabels(index::Integer)
path = joinpath(artifact"MNIST", TRAINLABELS)
Int(Reader.readlabels(path, index))
end

"""
testlabels([indices]; [dir])
testlabels([indices])

Returns the MNIST **testset** labels corresponding to the given
`indices` as an `Int` or `Vector{Int}`. The values of the labels
Expand All @@ -177,21 +171,19 @@ julia> MNIST.testlabels(1:3) # first three labels
julia> MNIST.testlabels(1) # first label
7
```

$(download_docstring("MNIST", DEPNAME))
"""
function testlabels(args...; dir = nothing)
path = datafile(DEPNAME, TESTLABELS, dir)
function testlabels(args...)
path = joinpath(artifact"MNIST", TESTLABELS)
Vector{Int}(Reader.readlabels(path, args...))
end

function testlabels(index::Integer; dir = nothing)
path = datafile(DEPNAME, TESTLABELS, dir)
function testlabels(index::Integer)
path = joinpath(artifact"MNIST", TESTLABELS)
Int(Reader.readlabels(path, index))
end

"""
traindata([T = N0f8], [indices]; [dir]) -> Tuple
traindata([T = N0f8], [indices]) -> Tuple

Returns the MNIST **trainingset** corresponding to the given
`indices` as a two-element tuple. If `indices` is omitted the
Expand All @@ -209,23 +201,20 @@ represent.
```julia
train_x, train_y = MNIST.traindata() # full dataset
train_x, train_y = MNIST.traindata(2) # only second observation
train_x, train_y = MNIST.traindata(dir="./MNIST") # custom folder
```

$(download_docstring("MNIST", DEPNAME))

Take a look at [`MNIST.traintensor`](@ref) and
[`MNIST.trainlabels`](@ref) for more information.
"""
function traindata(::Type{T}, args...; dir = nothing) where T
(traintensor(T, args...; dir = dir),
trainlabels(args...; dir = dir))
function traindata(::Type{T}, args...) where T
(traintensor(T, args...),
trainlabels(args...))
end

traindata(args...; dir = nothing) = traindata(N0f8, args...; dir = dir)
traindata(args...) = traindata(N0f8, args...)

"""
testdata([T = N0f8], [indices]; [dir]) -> Tuple
testdata([T = N0f8], [indices]) -> Tuple

Returns the MNIST **testset** corresponding to the given
`indices` as a two-element tuple. If `indices` is omitted the
Expand All @@ -243,17 +232,14 @@ represent.
```julia
test_x, test_y = MNIST.testdata() # full dataset
test_x, test_y = MNIST.testdata(2) # only second observation
test_x, test_y = MNIST.testdata(dir="./MNIST") # custom folder
```

$(download_docstring("MNIST", DEPNAME))

Take a look at [`MNIST.testtensor`](@ref) and
[`MNIST.testlabels`](@ref) for more information.
"""
function testdata(::Type{T}, args...; dir = nothing) where T
(testtensor(T, args...; dir = dir),
testlabels(args...; dir = dir))
function testdata(::Type{T}, args...) where T
(testtensor(T, args...),
testlabels(args...))
end

testdata(args...; dir = nothing) = testdata(N0f8, args...; dir = dir)
testdata(args...) = testdata(N0f8, args...)