Merge pull request #19 from JuliaAI/dev
For a 0.3.0 release
ablaom authored Aug 30, 2022
2 parents 4444745 + ec9cd80 commit 738790c
Showing 2 changed files with 66 additions and 150 deletions.
6 changes: 3 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "OpenML"
uuid = "8b6db2d4-7670-4922-a472-f9537c81ab66"
authors = ["Diego Arenas <[email protected]>", "Anthony D. Blaom <[email protected]>"]
version = "0.2.0"
version = "0.3.0"

[deps]
ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
@@ -11,8 +11,8 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

[compat]
ARFFFiles = "1.3"
HTTP = "0.8, 0.9"
ARFFFiles = "1.4.1"
HTTP = "0.8, 0.9,1"
JSON = "0.21"
julia = "1"

210 changes: 63 additions & 147 deletions src/data.jl
@@ -1,43 +1,51 @@
const API_URL = "https://www.openml.org/api/v1/json"

# Data API
# The structures are based on these descriptions
# https://github.com/openml/OpenML/tree/master/openml_OS/views/pages/api_new/v1/xsd
# https://www.openml.org/api_docs#!/data/get_data_id
struct OpenMLAPIError <: Exception
msg::String
end
function Base.showerror(io::IO, e::OpenMLAPIError)
print(io, e.msg)
end


"""
Returns information about a dataset. The information includes the name,
information about the creator, URL to download it and more.
- 110 - Please provide data_id.
- 111 - Unknown dataset. Data set description with data_id was not found in the database.
- 112 - No access granted. This dataset is not shared with you.
"""
function load_Dataset_Description(id::Int; api_key::String="")
url = string(API_URL, "/data/$id")
# Data API. See REST API on https://www.openml.org/apis
function get(query)
try
r = HTTP.request("GET", url)
if r.status == 200
return JSON.parse(String(r.body))
elseif r.status == 110
println("Please provide data_id.")
elseif r.status == 111
println("Unknown dataset. Data set description with data_id was not found in the database.")
elseif r.status == 112
println("No access granted. This dataset is not shared with you.")
end
r = HTTP.request("GET", string(API_URL, query))
return JSON.parse(String(r.body))
catch e
println("Error occurred. Check if there exists a dataset with id $id.")
println("See e.g. OpenML.list_datasets()\n")
println(e)
return nothing
if isa(e, HTTP.StatusError) && e.status == 412
error_string = String(e.response.body)
err = try
JSON.parse(error_string)["error"]
catch
@error(error_string)
throw(OpenMLAPIError("Malformed query \"$query\"."))
end
msg = err["message"]
code = err["code"]
additional_msg = haskey(err, "additional_message") ? err["additional_message"] : ""
if code == "111"
additional_msg *= "Check if there is a dataset with id $(last(split(query, '/'))).\nSee e.g. OpenML.list_datasets(). "
end
throw(OpenMLAPIError(msg * ". " * additional_msg * "(error code $code)"))
else
rethrow()
end
end
return nothing
end
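# Illustrative sketch (not part of this commit): the one-line wrappers below all
# funnel through `get`. A well-formed query returns the parsed JSON response; a
# failing one (HTTP 412) resurfaces the server's message as an `OpenMLAPIError`,
# e.g. (assuming the second id does not exist on the server):
#
#     julia> OpenML.get("/data/61")        # parsed JSON Dict on success
#     julia> OpenML.get("/data/99999999")  # throws OpenMLAPIError (error code "111")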

"""
OpenML.load(id)
OpenML.load_Dataset_Description(id::Int)
Returns information about a dataset. The information includes the name,
information about the creator, URL to download it and more.
"""
load_Dataset_Description(id::Int) = get("/data/$id")

"""
OpenML.load(id; maxbytes = nothing)
Load the OpenML dataset with specified `id`, from those listed by
[`list_datasets`](@ref) or on the [OpenML site](https://www.openml.org/search?type=data).
@@ -54,9 +62,11 @@ table = OpenML.load(61)
df = DataFrame(table) # transform to a DataFrame
using ScientificTypes
df2 = coerce(df, autotype(df)) # coerce to automatically detected scientific types
peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table
```
"""
function load(id::Int)
function load(id::Int; maxbytes = nothing)
if VERSION > v"1.3.0"
dir = first(Artifacts.artifacts_dirs())
toml = joinpath(dir, "OpenMLArtifacts.toml")
@@ -74,135 +84,42 @@ function load(id::Int)
filename = tempname()
download(url, filename)
end
ARFFFiles.load(filename)
ARFFFiles.load(x -> ARFFFiles.readcolumns(x; maxbytes = maxbytes), filename)
end
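# Editorial note: the `maxbytes` keyword is forwarded to `ARFFFiles.readcolumns`,
# so only the leading bytes of the downloaded (or artifact-cached) ARFF file are
# parsed. That is what makes the `OpenML.load(61, maxbytes = 1024)` "peek" shown
# in the docstring above cheap.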


"""
Returns a list of all data qualities in the system.
load_Data_Qualities_List()
- 412 - Precondition failed. An error code and message are returned
- 370 - No data qualities available. There are no data qualities in the system.
Returns a list of all data qualities in the system.
"""
function load_Data_Qualities_List()
url = string(API_URL, "/data/qualities/list")
try
r = HTTP.request("GET", url)
if r.status == 200
return JSON.parse(String(r.body))
elseif r.status == 370
println("No data qualities available. There are no data qualities in the system.")
end
catch e
println("Error occurred : $e")
return nothing
end
return nothing
end
load_Data_Qualities_List() = get("/data/qualities/list")

"""
Returns a list of all data qualities in the system.
load_Data_Qualities(id::Int)
- 271 - Unknown dataset. Data set with the given data ID was not found (or is not shared with you).
- 272 - No features found. The dataset did not contain any features, or we could not extract them.
- 273 - Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.
- 274 - Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins.
Returns the qualities of dataset `id`.
"""
function load_Data_Features(id::Int; api_key::String = "")
if api_key == ""
url = string(API_URL, "/data/features/$id")
end
try
r = HTTP.request("GET", url)
if r.status == 200
return JSON.parse(String(r.body))
elseif r.status == 271
println("Unknown dataset. Data set with the given data ID was not found (or is not shared with you).")
elseif r.status == 272
println("No features found. The dataset did not contain any features, or we could not extract them.")
elseif r.status == 273
println("Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.")
elseif r.status == 274
println("Dataset processed with error. The feature extractor has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, please contact the API admins.")
end
catch e
println("Error occurred : $e")
return nothing
end
return nothing
end
load_Data_Qualities(id::Int) = get("/data/qualities/$id")

"""
Returns the qualities of a dataset.
- 360 - Please provide data set ID
- 361 - Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you.
- 362 - No qualities found. The registered dataset did not contain any calculated qualities.
- 363 - Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes.
- 364 - Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team.
- 365 - Interval start or end illegal. There was a problem with the interval start or end.
load_Data_Features(id::Int)
Returns a list of all data features for dataset `id`.
"""
function load_Data_Qualities(id::Int; api_key::String = "")
if api_key == ""
url = string(API_URL, "/data/qualities/$id")
end
try
r = HTTP.request("GET", url)
if r.status == 200
return JSON.parse(String(r.body))
elseif r.status == 360
println("Please provide data set ID")
elseif r.status == 361
println("Unknown dataset. The data set with the given ID was not found in the database, or is not shared with you.")
elseif r.status == 362
println("No qualities found. The registered dataset did not contain any calculated qualities.")
elseif r.status == 363
println("Dataset not processed yet. The dataset was not processed yet, no qualities are available. Please wait for a few minutes.")
elseif r.status == 364
println("Dataset processed with error. The quality calculator has run into an error while processing the dataset. Please check whether it is a valid supported file. If so, contact the support team.")
elseif r.status == 365
println("Interval start or end illegal. There was a problem with the interval start or end.")
end
catch e
println("Error occurred : $e")
return nothing
end
return nothing
end
load_Data_Features(id::Int) = get("/data/features/$id")
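# Illustrative sketch (not part of this commit; the JSON key layout is an
# assumption about the OpenML response, not something guaranteed here):
#
#     julia> features = OpenML.load_Data_Features(61)["data_features"]["feature"];
#     julia> [f["name"] for f in features]  # feature names of dataset 61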

"""
load_List_And_Filter(filters; api_key = "")
load_List_And_Filter(filters)
See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
"""
function load_List_And_Filter(filters::String; api_key::String = "")
if api_key == ""
url = string(API_URL, "/data/list/$filters")
end
try
r = HTTP.request("GET", url)
if r.status == 200
return JSON.parse(String(r.body))
elseif r.status == 370
println("Illegal filter specified.")
elseif r.status == 371
println("Filter values/ranges not properly specified.")
elseif r.status == 372
println("No results. There where no matches for the given constraints.")
elseif r.status == 373
println("Can not specify an offset without a limit.")
end
catch e
println("Error occurred : $e")
return nothing
end
return nothing
end
load_List_And_Filter(filters::String) = get("/data/list/$filters")
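# Illustrative sketch (not part of this commit): `filters` is passed verbatim as
# a path fragment of the REST query, using the key/value path syntax documented
# at the OpenML API link above. The particular filter values are assumptions for
# illustration, not validated here:
#
#     julia> OpenML.load_List_And_Filter("limit/5/status/active")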

qualitynames(x) = haskey(x, "name") ? [x["name"]] : []

"""
list_datasets(; tag = nothing, filters = "" api_key = "", output_format = NamedTuple)
list_datasets(; tag = nothing, filters = "", output_format = NamedTuple)
Lists all active OpenML datasets, if `tag = nothing` (default).
To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref).
@@ -237,8 +154,8 @@ julia> ds = OpenML.list_datasets(
julia> sort!(ds, :NumberOfFeatures)
```
"""
function list_datasets(; tag = nothing, filter = "", filters=filter,
api_key = "", output_format = NamedTuple)
function list_datasets(; tag = nothing, filter = "", filters = filter,
output_format = NamedTuple)
if tag !== nothing
if is_valid_tag(tag)
filters *= "/tag/$tag"
@@ -247,7 +164,7 @@ function list_datasets(; tag = nothing, filter = "", filters=filter,
return
end
end
data = OpenML.load_List_And_Filter(filters; api_key = api_key)
data = OpenML.load_List_And_Filter(filters)
datasets = data["data"]["dataset"]
qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
result = merge((id = Int[], name = String[], status = String[]),
@@ -278,12 +195,9 @@ is_valid_tag(tag) = false
List all available tags.
"""
function list_tags()
url = string(API_URL, "/data/tag/list")
try
r = HTTP.request("GET", url)
return JSON.parse(String(r.body))["data_tag_list"]["tag"]
catch
return nothing
result = get("/data/tag/list")
if !isnothing(result)
return result["data_tag_list"]["tag"]
end
end
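# Illustrative sketch (not part of this commit): the strings returned here are
# the values accepted by `list_datasets(tag = ...)` above, e.g.
#
#     julia> tags = OpenML.list_tags();
#     julia> "OpenML100" in tags && OpenML.list_datasets(tag = "OpenML100")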

@@ -318,7 +232,9 @@ julia> OpenML.describe_dataset(6)
```
"""
function describe_dataset(id)
description = load_Dataset_Description(id)["data_set_description"]["description"]
result = load_Dataset_Description(id)
result === nothing && return
description = result["data_set_description"]["description"]
if isa(description, AbstractString)
Markdown.parse(description)
else
