From 00bb922722680e065117cff612e86ecaf289a3f6 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 31 Aug 2021 21:51:57 +0200 Subject: [PATCH 1/4] save downloaded arff file as artifact --- Project.toml | 1 + src/OpenML.jl | 1 + src/data.jl | 18 +++++++++++------- test/data.jl | 9 +++++++++ 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index a99ef16..d3a525f 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" [compat] diff --git a/src/OpenML.jl b/src/OpenML.jl index 4a915e7..ad03dc4 100644 --- a/src/OpenML.jl +++ b/src/OpenML.jl @@ -5,6 +5,7 @@ using JSON import ARFFFiles import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype using Markdown +using Pkg.Artifacts export OpenML diff --git a/src/data.jl b/src/data.jl index 572039d..7ce5c19 100644 --- a/src/data.jl +++ b/src/data.jl @@ -5,10 +5,6 @@ const API_URL = "https://www.openml.org/api/v1/json" # https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd # https://www.openml.org/api_docs#!/data/get_data_id -# TODO: -# - Use e.g. DataDeps to cache data locally -# - Put the ARFF parser to a separate package or use ARFFFiles when -# https://github.com/cjdoris/ARFFFiles.jl/issues/4 is fixed. """ Returns information about a dataset. 
The information includes the name, @@ -58,9 +54,17 @@ df = DataFrame(table); ``` """ function load(id::Int; parser = :arff) - response = load_Dataset_Description(id) - arff_file = HTTP.request("GET", response["data_set_description"]["url"]) - data = ARFFFiles.load(IOBuffer(arff_file.body)) + dir = first(Artifacts.artifacts_dirs()) + toml = joinpath(dir, "OpenMLArtifacts.toml") + hash = artifact_hash(string(id), toml) + if hash === nothing || !artifact_exists(hash) + hash = Artifacts.create_artifact() do artifact_dir + url = load_Dataset_Description(id)["data_set_description"]["url"] + download(url, joinpath(artifact_dir, "$id.arff")) + end + bind_artifact!(toml, string(id), hash) + end + data = ARFFFiles.load(joinpath(artifact_path(hash), "$id.arff")) if parser == :auto return coerce(data, autotype(data)) else diff --git a/test/data.jl b/test/data.jl index 1da9430..b4eec2c 100644 --- a/test/data.jl +++ b/test/data.jl @@ -4,6 +4,7 @@ using Test using HTTP using OpenML import Tables.istable +using Pkg.Artifacts response_test = OpenML.load_Dataset_Description(61) ntp_test = OpenML.load(61) @@ -40,5 +41,13 @@ end @test length(filters_test["data"]["dataset"][1]) == offset end +@testset "artifacts" begin + dir = first(Artifacts.artifacts_dirs()) + toml = joinpath(dir, "OpenMLArtifacts.toml") + hash = artifact_hash("61", toml) + @test artifact_exists(hash) +end + + end true From 8b3d15b81c2b6d19414c55ef023053fac8f909df Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Wed, 1 Sep 2021 16:10:20 +0200 Subject: [PATCH 2/4] compat with v1.0 --- src/OpenML.jl | 4 +++- src/data.jl | 25 ++++++++++++++++--------- test/data.jl | 15 ++++++++------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/OpenML.jl b/src/OpenML.jl index ad03dc4..ae0b1c3 100644 --- a/src/OpenML.jl +++ b/src/OpenML.jl @@ -5,7 +5,9 @@ using JSON import ARFFFiles import ScientificTypes: Continuous, Count, Textual, Multiclass, coerce, autotype using Markdown -using Pkg.Artifacts +if VERSION >= 
v"1.3.0" + using Pkg.Artifacts +end export OpenML diff --git a/src/data.jl b/src/data.jl index 7ce5c19..9bee2b3 100644 --- a/src/data.jl +++ b/src/data.jl @@ -54,17 +54,24 @@ df = DataFrame(table); ``` """ function load(id::Int; parser = :arff) - dir = first(Artifacts.artifacts_dirs()) - toml = joinpath(dir, "OpenMLArtifacts.toml") - hash = artifact_hash(string(id), toml) - if hash === nothing || !artifact_exists(hash) - hash = Artifacts.create_artifact() do artifact_dir - url = load_Dataset_Description(id)["data_set_description"]["url"] - download(url, joinpath(artifact_dir, "$id.arff")) + if VERSION >= v"1.3.0" + dir = first(Artifacts.artifacts_dirs()) + toml = joinpath(dir, "OpenMLArtifacts.toml") + hash = artifact_hash(string(id), toml) + if hash === nothing || !artifact_exists(hash) + hash = Artifacts.create_artifact() do artifact_dir + url = load_Dataset_Description(id)["data_set_description"]["url"] + download(url, joinpath(artifact_dir, "$id.arff")) + end + bind_artifact!(toml, string(id), hash) end - bind_artifact!(toml, string(id), hash) + filename = joinpath(artifact_path(hash), "$id.arff") + else + url = load_Dataset_Description(id)["data_set_description"]["url"] + filename = tempname() + download(url, filename) end - data = ARFFFiles.load(joinpath(artifact_path(hash), "$id.arff")) + data = ARFFFiles.load(filename) if parser == :auto return coerce(data, autotype(data)) else diff --git a/test/data.jl b/test/data.jl index b4eec2c..f77b155 100644 --- a/test/data.jl +++ b/test/data.jl @@ -4,7 +4,6 @@ using Test using HTTP using OpenML import Tables.istable -using Pkg.Artifacts response_test = OpenML.load_Dataset_Description(61) ntp_test = OpenML.load(61) @@ -41,13 +40,15 @@ end @test length(filters_test["data"]["dataset"][1]) == offset end -@testset "artifacts" begin - dir = first(Artifacts.artifacts_dirs()) - toml = joinpath(dir, "OpenMLArtifacts.toml") - hash = artifact_hash("61", toml) - @test artifact_exists(hash) +if VERSION >= v"1.3.0" + using 
Pkg.Artifacts + @testset "artifacts" begin + dir = first(Artifacts.artifacts_dirs()) + toml = joinpath(dir, "OpenMLArtifacts.toml") + hash = artifact_hash("61", toml) + @test artifact_exists(hash) + end end - end true From 965b330539701f74eb9f7bf2a0d527f74a091a94 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 9 Sep 2021 08:34:34 +1200 Subject: [PATCH 3/4] update doc string for load --- src/data.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/data.jl b/src/data.jl index 9bee2b3..6065896 100644 --- a/src/data.jl +++ b/src/data.jl @@ -43,6 +43,8 @@ With `parser = :arff` (default) the ARFFFiles.jl parser is used. With `parser = :auto` the output of the ARFFFiles parser is coerced to automatically detected scientific types. +Datasets are saved as Julia artifacts so that they persist locally once loaded. + Returns a table. # Examples From 1a11b2a4aa761ff029e721e45d748c6282b750d6 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 9 Sep 2021 08:35:07 +1200 Subject: [PATCH 4/4] bump version = "0.1.1" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d3a525f..c1f01ff 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. Blaom "] -version = "0.1.0" +version = "0.1.1" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"