From 766a71c915f9e17e53386d18cda85b715ed6d883 Mon Sep 17 00:00:00 2001 From: "Anthony Blaom, PhD" Date: Fri, 4 Mar 2022 12:46:33 +1300 Subject: [PATCH 01/14] bump 0.2.1 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8ec59b9..87fba18 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. Blaom "] -version = "0.2.0" +version = "0.2.1" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" From f2daf084194075c0e6340b84acb133a5997f448e Mon Sep 17 00:00:00 2001 From: "Anthony Blaom, PhD" Date: Fri, 4 Mar 2022 12:50:36 +1300 Subject: [PATCH 02/14] revert previous commit (ie return version to 0.2.0) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 87fba18..8ec59b9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. Blaom "] -version = "0.2.1" +version = "0.2.0" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" From 56f75c731bf066ce5569e4cc822b223bb6139648 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 09:24:00 +0200 Subject: [PATCH 03/14] improve error messages --- src/data.jl | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/data.jl b/src/data.jl index a68782e..2829424 100644 --- a/src/data.jl +++ b/src/data.jl @@ -14,23 +14,20 @@ information about the creator, URL to download it and more. - 111 - Unknown dataset. Data set description with data_id was not found in the database. - 112 - No access granted. This dataset is not shared with you. """ -function load_Dataset_Description(id::Int; api_key::String="") +function load_Dataset_Description(id::Int) url = string(API_URL, "/data/$id") try r = HTTP.request("GET", url) - if r.status == 200 - return JSON.parse(String(r.body)) - elseif r.status == 110 - println("Please provide data_id.") - elseif r.status == 111 - println("Unknown dataset. Data set description with data_id was not found in the database.") - elseif r.status == 112 - println("No access granted. This dataset is not shared with you.") - end + return JSON.parse(String(r.body)) catch e - println("Error occurred. Check if there exists a dataset with id $id.") - println("See e.g. OpenML.list_datasets()\n") - println(e) + if isa(e, HTTP.StatusError) && e.status == 412 + error = JSON.parse(String(e.response.body))["error"] + @error error["message"] + else + println("Error occurred. Check if there exists a dataset with id $id.") + println("See e.g. OpenML.list_datasets()\n") + println(e) + end return nothing end return nothing From b0b90f4061c353fe7681acdeee52aef1ba4ee8f9 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 09:25:19 +0200 Subject: [PATCH 04/14] update compat --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 8ec59b9..0bdaae6 100644 --- a/Project.toml +++ b/Project.toml @@ -11,8 +11,8 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [compat] -ARFFFiles = "1.3" -HTTP = "0.8, 0.9" +ARFFFiles = "^1.4.1" +HTTP = "0.8, 0.9,1" JSON = "0.21" julia = "1" From 689f94bac0bbb63f78e7f17a4b4905d4ac451ec9 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 10:24:08 +0200 Subject: [PATCH 05/14] refactor GET handling --- src/data.jl | 188 +++++++++++++++------------------------------------- 1 file changed, 53 insertions(+), 135 deletions(-) diff --git a/src/data.jl b/src/data.jl index 2829424..1d18acb 100644 --- a/src/data.jl +++ b/src/data.jl @@ -6,33 +6,45 @@ const API_URL = "https://www.openml.org/api/v1/json" # https://www.openml.org/api_docs#!/data/get_data_id -""" -Returns information about a dataset. The information includes the name, -information about the creator, URL to download it and more. +function error_msg_handling(e) + if isa(e, HTTP.StatusError) && e.status == 412 + try + err = JSON.parse(String(e.response.body))["error"] + msg = err["message"] + code = err["code"] + additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" + @error msg * " " * additional_msg * "(error code $code)" + catch + @error String(e.response.body) + end + else + println(e) + end + return nothing +end -- 110 - Please provide data_id. -- 111 - Unknown dataset. Data set description with data_id was not found in the database. -- 112 - No access granted. This dataset is not shared with you. -""" -function load_Dataset_Description(id::Int) - url = string(API_URL, "/data/$id") +function get(url; extra_error_message = "") try - r = HTTP.request("GET", url) + r = HTTP.request("GET", string(API_URL, url)) return JSON.parse(String(r.body)) catch e - if isa(e, HTTP.StatusError) && e.status == 412 - error = JSON.parse(String(e.response.body))["error"] - @error error["message"] - else - println("Error occurred. Check if there exists a dataset with id $id.") - println("See e.g. OpenML.list_datasets()\n") - println(e) - end - return nothing + error_msg_handling(e) + extra_error_message != "" && println(extra_error_message) end return nothing end +""" + OpenML.load_Dataset_Description(id::Int) + +Returns information about a dataset. The information includes the name, +information about the creator, URL to download it and more. +""" +function load_Dataset_Description(id::Int) + get("data/$id", + extra_error_message = "Check if there is a dataset with id $id.\nSee e.g. OpenML.list_datasets()\n") +end + """ OpenML.load(id) @@ -76,130 +88,37 @@ end """ -Returns a list of all data qualities in the system. + load_Data_Qualities_List() -- 412 - Precondition failed. An error code and message are returned -- 370 - No data qualities available. There are no data qualities in the system. +Returns a list of all data qualities in the system. """ -function load_Data_Qualities_List() - url = string(API_URL, "/data/qualities/list") - try - r = HTTP.request("GET", url) - if r.status == 200 - return JSON.parse(String(r.body)) - elseif r.status == 370 - println("No data qualities available. There are no data qualities in the system.") - end - catch e - println("Error occurred : $e") - return nothing - end - return nothing -end +load_Data_Qualities_List() = get("/data/qualities/list") """ -Returns a list of all data qualities in the system. + load_Data_Qualities(id::Int) -- 271 - Unknown dataset. Data set with the given data ID was not found (or is not shared with you). -- 272 - No features found. The dataset did not contain any features, or we could not extract them. -- 273 - Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes. -- 274 - Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins. +Returns the qualities of dataset `id`. """ -function load_Data_Features(id::Int; api_key::String = "") - if api_key == "" - url = string(API_URL, "/data/features/$id") - end - try - r = HTTP.request("GET", url) - if r.status == 200 - return JSON.parse(String(r.body)) - elseif r.status == 271 - println("Unknown dataset. Data set with the given data ID was not found (or is not shared with you).") - elseif r.status == 272 - println("No features found. The dataset did not contain any features, or we could not extract them.") - elseif r.status == 273 - println("Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.") - elseif r.status == 274 - println("Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins.") - end - catch e - println("Error occurred : $e") - return nothing - end - return nothing -end +load_Data_Qualities(id::Int) = get("/data/qualities/$id") """ -Returns the qualities of a dataset. - -- 360 - Please provide data set ID -- 361 - Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you. -- 362 - No qualities found. The registered dataset did not contain any calculated qualities. -- 363 - Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes. -- 364 - Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team. -- 365 - Interval start or end illegal. There was a problem with the interval start or end. + load_Data_Features(id::Int) + +Returns a list of all data qualities for dataset `id`. """ -function load_Data_Qualities(id::Int; api_key::String = "") - if api_key == "" - url = string(API_URL, "/data/qualities/$id") - end - try - r = HTTP.request("GET", url) - if r.status == 200 - return JSON.parse(String(r.body)) - elseif r.status == 360 - println("Please provide data set ID") - elseif r.status == 361 - println("Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you.") - elseif r.status == 362 - println("No qualities found. The registered dataset did not contain any calculated qualities.") - elseif r.status == 363 - println("Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes.") - elseif r.status == 364 - println("Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team.") - elseif r.status == 365 - println("Interval start or end illegal. There was a problem with the interval start or end.") - end - catch e - println("Error occurred : $e") - return nothing - end - return nothing -end +load_Data_Features(id::Int) = get("/data/features/$id") """ - load_List_And_Filter(filters; api_key = "") + load_List_And_Filter(filters) See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters). """ -function load_List_And_Filter(filters::String; api_key::String = "") - if api_key == "" - url = string(API_URL, "/data/list/$filters") - end - try - r = HTTP.request("GET", url) - if r.status == 200 - return JSON.parse(String(r.body)) - elseif r.status == 370 - println("Illegal filter specified.") - elseif r.status == 371 - println("Filter values/ranges not properly specified.") - elseif r.status == 372 - println("No results. There where no matches for the given constraints.") - elseif r.status == 373 - println("Can not specify an offset without a limit.") - end - catch e - println("Error occurred : $e") - return nothing - end - return nothing -end +load_List_And_Filter(filters::String) = get("/data/list/$filters") qualitynames(x) = haskey(x, "name") ? [x["name"]] : [] """ - list_datasets(; tag = nothing, filters = "" api_key = "", output_format = NamedTuple) + list_datasets(; tag = nothing, filters = "", output_format = NamedTuple) Lists all active OpenML datasets, if `tag = nothing` (default). To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref). @@ -234,8 +153,8 @@ julia> ds = OpenML.list_datasets( julia> sort!(ds, :NumberOfFeatures) ``` """ -function list_datasets(; tag = nothing, filter = "", filters=filter, - api_key = "", output_format = NamedTuple) +function list_datasets(; tag = nothing, filter = "", filters = filter, + output_format = NamedTuple) if tag !== nothing if is_valid_tag(tag) filters *= "/tag/$tag" @@ -244,7 +163,7 @@ function list_datasets(; tag = nothing, filter = "", filters=filter, return end end - data = OpenML.load_List_And_Filter(filters; api_key = api_key) + data = OpenML.load_List_And_Filter(filters) datasets = data["data"]["dataset"] qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...))) result = merge((id = Int[], name = String[], status = String[]), @@ -275,12 +194,9 @@ is_valid_tag(tag) = false List all available tags. """ function list_tags() - url = string(API_URL, "/data/tag/list") - try - r = HTTP.request("GET", url) - return JSON.parse(String(r.body))["data_tag_list"]["tag"] - catch - return nothing + result = get("/data/tag/list") + if !isnothing(result) + return result["data_tag_list"]["tag"] end end @@ -315,7 +231,9 @@ julia> OpenML.describe_dataset(6) ``` """ function describe_dataset(id) - description = load_Dataset_Description(id)["data_set_description"]["description"] + result = load_Dataset_Description(id) + result === nothing && return + description = result["data_set_description"]["description"] if isa(description, AbstractString) Markdown.parse(description) else From 7e7f86e4bccfd2de6d2cfc5db63f116d2c5e13ba Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 10:24:46 +0200 Subject: [PATCH 06/14] allow to limit the bytes when loading --- src/data.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data.jl b/src/data.jl index 1d18acb..7ab92df 100644 --- a/src/data.jl +++ b/src/data.jl @@ -46,7 +46,7 @@ function load_Dataset_Description(id::Int) end """ - OpenML.load(id) + OpenML.load(id; maxbytes = nothing) Load the OpenML dataset with specified `id`, from those listed by [`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data). @@ -63,9 +63,11 @@ table = OpenML.load(61) df = DataFrame(table) # transform to a DataFrame using ScientificTypes df2 = coerce(df, autotype(df)) # corce to automatically detected scientific types + +peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table ``` """ -function load(id::Int) +function load(id::Int; maxbytes = nothing) if VERSION > v"1.3.0" dir = first(Artifacts.artifacts_dirs()) toml = joinpath(dir, "OpenMLArtifacts.toml") @@ -83,7 +85,7 @@ function load(id::Int) filename = tempname() download(url, filename) end - ARFFFiles.load(filename) + ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes), filename) end From 3633f87b182028bb50744bf21f8c58e8740d5d04 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 10:26:24 +0200 Subject: [PATCH 07/14] bump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 0bdaae6..1dd5d2f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "OpenML" uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66" authors = ["Diego Arenas ", "Anthony D. Blaom "] -version = "0.2.0" +version = "0.3.0" [deps] ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" From 2b84617b7d4ba297f092422581096e6cc4da717b Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 10:35:20 +0200 Subject: [PATCH 08/14] adapt to old julia versions --- src/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.jl b/src/data.jl index 7ab92df..251b8c3 100644 --- a/src/data.jl +++ b/src/data.jl @@ -85,7 +85,7 @@ function load(id::Int; maxbytes = nothing) filename = tempname() download(url, filename) end - ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes), filename) + ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes = maxbytes), filename) end From eb4fac7ed2f2053a12078427c30d44d71ca23a1c Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 9 Aug 2022 10:48:42 +0200 Subject: [PATCH 09/14] fix --- src/data.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.jl b/src/data.jl index 251b8c3..bebf7aa 100644 --- a/src/data.jl +++ b/src/data.jl @@ -41,7 +41,7 @@ Returns information about a dataset. The information includes the name, information about the creator, URL to download it and more. """ function load_Dataset_Description(id::Int) - get("data/$id", + get("/data/$id", extra_error_message = "Check if there is a dataset with id $id.\nSee e.g. OpenML.list_datasets()\n") end From 36b6f8ef5495ef921e1a163be96f26e106f17328 Mon Sep 17 00:00:00 2001 From: jbrea Date: Mon, 29 Aug 2022 22:10:58 +0200 Subject: [PATCH 10/14] Update Project.toml Co-authored-by: Anthony Blaom, PhD --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 1dd5d2f..a44b0d9 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,7 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [compat] -ARFFFiles = "^1.4.1" +ARFFFiles = "1.4.1" HTTP = "0.8, 0.9,1" JSON = "0.21" julia = "1" From 1f1cfea1d2a11e3b29eea11cf80c5c6f14b5e4e3 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Mon, 29 Aug 2022 22:34:11 +0200 Subject: [PATCH 11/14] throw error --- src/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data.jl b/src/data.jl index bebf7aa..4922af9 100644 --- a/src/data.jl +++ b/src/data.jl @@ -15,10 +15,10 @@ function error_msg_handling(e) additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" @error msg * " " * additional_msg * "(error code $code)" catch - @error String(e.response.body) + @error e end else - println(e) + throw(e) end return nothing end From ccaee38cf8018e8a5bc92036d6bcff193a7bf2b9 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Mon, 29 Aug 2022 22:34:29 +0200 Subject: [PATCH 12/14] rename argument --- src/data.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data.jl b/src/data.jl index 4922af9..6e44977 100644 --- a/src/data.jl +++ b/src/data.jl @@ -23,9 +23,9 @@ function error_msg_handling(e) return nothing end -function get(url; extra_error_message = "") +function get(query; extra_error_message = "") try - r = HTTP.request("GET", string(API_URL, url)) + r = HTTP.request("GET", string(API_URL, query)) return JSON.parse(String(r.body)) catch e error_msg_handling(e) From 3ce91488f606cf06b9fba6ac1103b8de2203ce52 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 30 Aug 2022 09:01:21 +0200 Subject: [PATCH 13/14] better error handling --- src/data.jl | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/data.jl b/src/data.jl index 6e44977..12ed7cf 100644 --- a/src/data.jl +++ b/src/data.jl @@ -6,30 +6,25 @@ const API_URL = "https://www.openml.org/api/v1/json" # https://www.openml.org/api_docs#!/data/get_data_id -function error_msg_handling(e) - if isa(e, HTTP.StatusError) && e.status == 412 - try - err = JSON.parse(String(e.response.body))["error"] - msg = err["message"] - code = err["code"] - additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" - @error msg * " " * additional_msg * "(error code $code)" - catch - @error e - end - else - throw(e) - end - return nothing -end - function get(query; extra_error_message = "") try r = HTTP.request("GET", string(API_URL, query)) return JSON.parse(String(r.body)) catch e - error_msg_handling(e) - extra_error_message != "" && println(extra_error_message) + if isa(e, HTTP.StatusError) && e.status == 412 + try + err = JSON.parse(String(e.response.body))["error"] + msg = err["message"] + code = err["code"] + additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" + @error msg * " " * additional_msg * "(error code $code)" + catch + @error e + end + extra_error_message != "" && println(extra_error_message) + else + throw(e) + end end return nothing end From 9b4698279e9e59701c57aad0f162635277f2d55c Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 30 Aug 2022 09:57:22 +0200 Subject: [PATCH 14/14] use OpenMLAPIError --- src/data.jl | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/data.jl b/src/data.jl index 12ed7cf..1cf5af3 100644 --- a/src/data.jl +++ b/src/data.jl @@ -1,29 +1,36 @@ const API_URL = "https://www.openml.org/api/v1/json" -# Data API -# The structures are based on these descriptions -# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd -# https://www.openml.org/api_docs#!/data/get_data_id +struct OpenMLAPIError <: Exception + msg::String +end +function Base.showerror(io::IO, e::OpenMLAPIError) + print(io, e.msg) +end -function get(query; extra_error_message = "") +# Data API. See REST API on https://www.openml.org/apis +function get(query) try r = HTTP.request("GET", string(API_URL, query)) return JSON.parse(String(r.body)) catch e if isa(e, HTTP.StatusError) && e.status == 412 - try - err = JSON.parse(String(e.response.body))["error"] - msg = err["message"] - code = err["code"] - additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" - @error msg * " " * additional_msg * "(error code $code)" + error_string = String(e.response.body) + err = try + JSON.parse(error_string)["error"] catch - @error e + @error(error_string) + throw(OpenMLAPIError("Malformed query \"$query\".")) end - extra_error_message != "" && println(extra_error_message) + msg = err["message"] + code = err["code"] + additional_msg = haskey(err, "additional_message") ? err["additional_message"] : "" + if code == "111" + additional_msg *= "Check if there is a dataset with id $(last(split(query, '/'))).\nSee e.g. OpenML.list_datasets(). " + end + throw(OpenMLAPIError(msg * ". " * additional_msg * "(error code $code)")) else - throw(e) + rethrow() end end return nothing @@ -35,10 +42,7 @@ end Returns information about a dataset. The information includes the name, information about the creator, URL to download it and more. """ -function load_Dataset_Description(id::Int) - get("/data/$id", - extra_error_message = "Check if there is a dataset with id $id.\nSee e.g. OpenML.list_datasets()\n") -end +load_Dataset_Description(id::Int) = get("/data/$id") """ OpenML.load(id; maxbytes = nothing)