diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl index 3eaa574b..addcce90 100644 --- a/src/MLDatasets.jl +++ b/src/MLDatasets.jl @@ -71,9 +71,12 @@ include("datasets_vision/svhn2.jl") export SVHN2 # Text -include("datasets_text/PTBLM/PTBLM.jl") -include("datasets_text/UD_English/UD_English.jl") -include("datasets_text/SMSSpamCollection/SMSSpamCollection.jl") +include("datasets_text/ptblm.jl") +export PTBLM +include("datasets_text/udenglish.jl") +export UD_English +include("datasets_text/smsspamcollection.jl") +export SMSSpamCollection # Graphs include("datasets_graph/planetoid.jl") @@ -87,12 +90,20 @@ include("datasets_graph/KarateClub/KarateClub.jl") export KarateClub function __init__() + # TODO automatically find and execute __init__xxx functions + + # graph + __init__ogbdataset() + __init__tudataset() # misc __init__iris() __init__mutagenesis() - __init__ogbdataset() - __init__tudataset() + + #text + __init__ptblm() + __init__smsspam() + __init__udenglish() # vision __init__cifar10() @@ -103,4 +114,4 @@ function __init__() __init__svhn2() end -end +end #module diff --git a/src/abstract_datasets.jl b/src/abstract_datasets.jl index 0c1d3359..0a6ec974 100644 --- a/src/abstract_datasets.jl +++ b/src/abstract_datasets.jl @@ -47,6 +47,21 @@ Base.length(d::SupervisedDataset) = numobs((d.features, d.targets)) Base.getindex(d::SupervisedDataset) = getobs((; d.features, d.targets)) Base.getindex(d::SupervisedDataset, i) = getobs((; d.features, d.targets), i) +""" + abstract type UnsupervisedDataset <: AbstractDataset end + +An abstract dataset type for unsupervised or self-supervised learning tasks. +Concrete dataset types inheriting from it must provide +a `features` field. 
+""" +abstract type UnsupervisedDataset <: AbstractDataset end + + +Base.length(d::UnsupervisedDataset) = numobs(d.features) + +Base.getindex(d::UnsupervisedDataset) = getobs(d.features) +Base.getindex(d::UnsupervisedDataset, i) = getobs(d.features, i) + ### DOCSTRING TEMPLATES ###################### diff --git a/src/datasets_text/PTBLM/PTBLM.jl b/src/datasets_text/PTBLM/PTBLM.jl deleted file mode 100644 index 6551ddd2..00000000 --- a/src/datasets_text/PTBLM/PTBLM.jl +++ /dev/null @@ -1,89 +0,0 @@ -export PTBLM - -""" - PTBLM - -The PTBLM dataset consists of Penn Treebank sentences -for language modeling, available from tomsercu/lstm. -The unknown words are replaced with so that the -total vocaburary size becomes 10000. -""" -module PTBLM - - using DataDeps - using ..MLDatasets: datafile, download_dep - - export - - traindata, - testdata, - - download - - const DEPNAME = "PTBLM" - const TRAINFILE = "ptb.train.txt" - const TESTFILE = "ptb.test.txt" - - download(args...; kw...) = download_dep(DEPNAME, args...; kw...) - - traindata(; dir = nothing) = traindata(dir) - testdata(; dir = nothing) = testdata(dir) - - function traindata(dir) - path = datafile(DEPNAME, TRAINFILE, dir) - xs = readdata(path) - ys = makeys(xs) - xs, ys - end - - function testdata(dir) - path = datafile(DEPNAME, TESTFILE, dir) - xs = readdata(path) - ys = makeys(xs) - xs, ys - end - - function readdata(path) - lines = open(readlines, path) - map(l -> Vector{String}(split(chomp(l))), lines) - end - - function makeys(xs::Vector{Vector{String}}) - map(xs) do x - y = copy(x) - popfirst!(y) - push!(y, "") - end - end - - function __init__() - register(DataDep( - DEPNAME, - """ - Dataset: Penn Treebank sentences for language modeling - Website: https://github.com/tomsercu/lstm - - ----------------------------------------------------- - WARNING: EXPERIMENTAL STATUS - Please be aware that this dataset is from a secondary - source. 
The provided interface by this package is not - as developed as those for other datasets. We would - welcome any contribution to provide this dataset in a - more mature manner. - ------------------------------------------------------ - - The PTBLM dataset consists of Penn Treebank sentences - for language modeling, available from tomsercu/lstm. - The unknown words are replaced with so that the - total vocaburary size becomes 10000. - - The files are available for download at the github - repository linked above. Note that using the data - responsibly and respecting copyright remains your - responsibility. - """, - "https://raw.githubusercontent.com/tomsercu/lstm/master/data/" .* [TRAINFILE, TESTFILE], - "218f4e6c7288bb5efeb03cc4cb8ae9c04ecd8462ebfba8e13e3549fab69dc25f", - )) - end -end diff --git a/src/datasets_text/SMSSpamCollection/SMSSpamCollection.jl b/src/datasets_text/SMSSpamCollection/SMSSpamCollection.jl deleted file mode 100644 index 8f805d17..00000000 --- a/src/datasets_text/SMSSpamCollection/SMSSpamCollection.jl +++ /dev/null @@ -1,154 +0,0 @@ -export SMSSpamCollection - -""" - SMSSpamCollection - -1. DESCRIPTION --------------- - -The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam. - -1.1. Compilation ----------------- - -This corpus has been collected from free or free for research sources at the Web: - -- A collection of between 425 SMS spam messages extracted manually from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. 
The Grumbletext Web site is: http://www.grumbletext.co.uk/ -- A list of 450 SMS ham messages collected from Caroline Tag's PhD Theses available at http://etheses.bham.ac.uk/253/1/Tagg09PhD.pdf -- A subset of 3,375 SMS ham messages of the NUS SMS Corpus (NSC), which is a corpus of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans and mostly from students attending the University. These messages were collected from volunteers who were made aware that their contributions were going to be made publicly available. The NUS SMS Corpus is avalaible at: http://www.comp.nus.edu.sg/~rpnlpir/downloads/corpora/smsCorpus/ -- The amount of 1,002 SMS ham messages and 322 spam messages extracted from the SMS Spam Corpus v.0.1 Big created by José María Gómez Hidalgo and public available at: http://www.esp.uem.es/jmgomez/smsspamcorpus/ - - -1.2. Statistics ---------------- - -There is one collection: - -- The SMS Spam Collection v.1 (text file: smsspamcollection) has a total of 4,827 SMS legitimate messages (86.6%) and a total of 747 (13.4%) spam messages. - - -1.3. Format ------------ - -The files contain one message per line. Each line is composed by two columns: one with label (ham or spam) and other with the raw text. Here are some examples: - -ham What you doing?how are you? -ham Ok lar... Joking wif u oni... -ham dun say so early hor... U c already then say... -ham MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H* -ham Siva is in hostel aha:-. -ham Cos i was out shopping wif darren jus now n i called him 2 ask wat present he wan lor. Then he started guessing who i was wif n he finally guessed darren lor. -spam FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop -spam Sunshine Quiz! Win a super Sony DVD recorder if you canname the capital of Australia? 
Text MQUIZ to 82277. B -spam URGENT! Your Mobile No 07808726822 was awarded a L2,000 Bonus Caller Prize on 02/09/03! This is our 2nd attempt to contact YOU! Call 0871-872-9758 BOX95QU - -Note: messages are not chronologically sorted. - - -2. USAGE --------- - -We offer a comprehensive study of this corpus in the following paper that is under review. This work presents a number of statistics, studies and baseline results for several machine learning methods. - -[1] Almeida, T.A., Gómez Hidalgo, J.M., Yamakami, A. Contributions to the study of SMS Spam Filtering: New Collection and Results. Proceedings of the 2011 ACM Symposium on Document Engineering (ACM DOCENG'11), Mountain View, CA, USA, 2011. (Under review) - - -3. ABOUT --------- - -The corpus has been collected by Tiago Agostinho de Almeida (http://www.dt.fee.unicamp.br/~tiago) and José María Gómez Hidalgo (http://www.esp.uem.es/jmgomez). - -We would like to thank Dr. Min-Yen Kan (http://www.comp.nus.edu.sg/~kanmy/) and his team for making the NUS SMS Corpus available. See: http://www.comp.nus.edu.sg/~rpnlpir/downloads/corpora/smsCorpus/. He is currently collecting a bigger SMS corpus at: http://wing.comp.nus.edu.sg:8080/SMSCorpus/ - -4. LICENSE/DISCLAIMER ---------------------- - -We would appreciate if: - -- In case you find this corpus useful, please make a reference to previous paper and the web page: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/ in your papers, research, etc. -- Send us a message to tiago@dt.fee.unicamp.br in case you make use of the corpus. - -The SMS Spam Collection v.1 is provided for free and with no limitations excepting: - -1. Tiago Agostinho de Almeida and José María Gómez Hidalgo hold the copyrigth (c) for the SMS Spam Collection v.1. - -2. No Warranty/Use At Your Risk. THE CORPUS IS MADE AT NO CHARGE. 
ACCORDINGLY, THE CORPUS IS PROVIDED `AS IS,' WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION THE WARRANTIES THAT THEY ARE MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. YOU ARE SOLELY RESPONSIBLE FOR YOUR USE, DISTRIBUTION, MODIFICATION, REPRODUCTION AND PUBLICATION OF THE CORPUS AND ANY DERIVATIVE WORKS THEREOF BY YOU AND ANY OF YOUR SUBLICENSEES (COLLECTIVELY, `YOUR CORPUS USE'). THE ENTIRE RISK AS TO YOUR CORPUS USE IS BORNE BY YOU. YOU AGREE TO INDEMNIFY AND HOLD THE COPYRIGHT HOLDERS, AND THEIR AFFILIATES HARMLESS FROM ANY CLAIMS ARISING FROM OR RELATING TO YOUR CORPUS USE. - -3. Limitation of Liability. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR THEIR AFFILIATES, OR THE CORPUS CONTRIBUTING EDITORS, BE LIABLE FOR ANY INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF ADVISED OF THE POSSIBILITY THEREOF, AND REGARDLESS OF WHETHER ANY CLAIM IS BASED UPON ANY CONTRACT, TORT OR OTHER LEGAL OR EQUITABLE THEORY, RELATING OR ARISING FROM THE CORPUS, YOUR CORPUS USE OR THIS LICENSE AGREEMENT. 
- -# Interface - -- SMSSpamCollection.features` -- SMSSpamCollection.targets - -```julia-repl -julia> using MLDatasets: SMSSpamCollection - -julia> targets = SMSSpamCollection.targets(); - -julia> summary(targets) -"5574-element Vector{Any}" - -julia> targets[1] -"ham" - -julia> summary(features) -"5574-element Vector{Any}" -""" -module SMSSpamCollection - -using DataDeps -using ..MLDatasets: bytes_to_type, datafile, download_dep, download_docstring -using DelimitedFiles - -export features, targets - -const DEPNAME = "SMSSpamCollection" -# const LINK = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/" -const LINK = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/" -const DOCS = "https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection#" -const DATA = "smsspamcollection.zip" - -""" - download() -""" -download(args...; kw...) = download_dep(DEPNAME, args...; kw...) - -function __init__() - register(DataDep( - DEPNAME, - """ - Dataset: The SMS Spam Collection v.1 - Website: $DOCS - """, - LINK .* [DATA], - "1587ea43e58e82b14ff1f5425c88e17f8496bfcdb67a583dbff9eefaf9963ce3", - post_fetch_method = unpack - )) -end - -function targets(; dir = nothing) - path = datafile(DEPNAME, "SMSSpamCollection", dir) - f = open(path) - spam_data = readlines(f) - spam_data = [split(str, "\t") for str in spam_data] - targets = [] - for index in 1:length(spam_data) - push!(targets, spam_data[index][1]) - end - targets -end - -function features(; dir = nothing) - path = datafile(DEPNAME, "SMSSpamCollection", dir) - f = open(path) - spam_data = readlines(f) - spam_data = [split(str, "\t") for str in spam_data] - features = [] - for index in 1:length(spam_data) - push!(features, spam_data[index][2]) - end - features -end - -end # module diff --git a/src/datasets_text/UD_English/UD_English.jl b/src/datasets_text/UD_English/UD_English.jl deleted file mode 100644 index 1afa8d3d..00000000 --- 
a/src/datasets_text/UD_English/UD_English.jl +++ /dev/null @@ -1,116 +0,0 @@ -export UD_English - -""" - UD_English - -Dataset: Universal Dependencies - English Dependency Treebank Universal Dependencies English Web Treebank -Authors: Natalia Silveira and Timothy Dozat and - Marie-Catherine de Marneffe and Samuel - Bowman and Miriam Connor and John Bauer and - Christopher D. Manning -Website: https://github.com/UniversalDependencies/UD_English-EWT - -A Gold Standard Universal Dependencies Corpus for -English, built over the source material of the -English Web Treebank LDC2012T13 -(https://catalog.ldc.upenn.edu/LDC2012T13). -""" -module UD_English - - using DataDeps - using ..MLDatasets - using ..MLDatasets: datafile, download_dep - - export - - traindata, - testdata, - - download - - const DEPNAME = "UD_English" - const TRAINFILE = "en_ewt-ud-train.conllu" - const DEVFILE = "en_ewt-ud-dev.conllu" - const TESTFILE = "en_ewt-ud-test.conllu" - - download(args...; kw...) = download_dep(DEPNAME, args...; kw...) 
- - traindata(; dir = nothing) = traindata(dir) - devdata(; dir = nothing) = devdata(dir) - testdata(; dir = nothing) = testdata(dir) - - traindata(dir) = readdata(dir, TRAINFILE) - devdata(dir) = readdata(dir, DEVFILE) - testdata(dir) = readdata(dir, TESTFILE) - - function readdata(dir, filename) - path = datafile(DEPNAME, filename, dir) - conll_read(path) - end - - function conll_read(f, path::String) - doc = [] - sent = [] - lines = open(readlines, path) - for line in lines - line = chomp(line) - if length(line) == 0 - length(sent) > 0 && push!(doc, sent) - sent = [] - elseif line[1] == '#' # comment line - continue - else - items = Vector{String}(split(line,'\t')) - push!(sent, f(items)) - end - end - length(sent) > 0 && push!(doc, sent) - T = typeof(doc[1][1]) - Vector{Vector{T}}(doc) - end - - conll_read(path::String) = read(identity, path) - - - function __init__() - register(DataDep( - DEPNAME, - """ - Dataset: Universal Dependencies - English Dependency Treebank Universal Dependencies English Web Treebank - Authors: Natalia Silveira and Timothy Dozat and - Marie-Catherine de Marneffe and Samuel - Bowman and Miriam Connor and John Bauer and - Christopher D. Manning - Website: https://github.com/UniversalDependencies/UD_English-EWT - - A Gold Standard Universal Dependencies Corpus for - English, built over the source material of the - English Web Treebank LDC2012T13 - (https://catalog.ldc.upenn.edu/LDC2012T13). - - You are encouraged to cite this paper if you use the - Universal Dependencies English Web Treebank: - - @inproceedings{silveira14gold, - year = {2014}, - author = {Natalia Silveira and Timothy Dozat - and Marie-Catherine de Marneffe and Samuel - Bowman and Miriam Connor and John Bauer and - Christopher D. 
Manning}, - title = {A Gold Standard Dependency Corpus for {E}nglish}, - booktitle = {Proceedings of the Ninth - International Conference on Language Resources - and Evaluation (LREC-2014)} - } - - The files are available for download at the github - repository linked above. Note that using the data - responsibly and respecting copyright remains your - responsibility. Copyright and License is discussed in - detail on the Website. - """, - "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/" .* [TRAINFILE, DEVFILE, TESTFILE], - "e08d57e95264ac97ca861261e3119e093c054453c5dfc583e2402459504d93b7" - )) - end -end diff --git a/src/datasets_text/ptblm.jl b/src/datasets_text/ptblm.jl new file mode 100644 index 00000000..c8644e3c --- /dev/null +++ b/src/datasets_text/ptblm.jl @@ -0,0 +1,79 @@ + +function __init__ptblm() + DEPNAME = "PTBLM" + TRAINFILE = "ptb.train.txt" + TESTFILE = "ptb.test.txt" + + register(DataDep( + DEPNAME, + """ + Dataset: Penn Treebank sentences for language modeling + Website: https://github.com/tomsercu/lstm + + The files are available for download at the github + repository linked above. Note that using the data + responsibly and respecting copyright remains your + responsibility. + """, + "https://raw.githubusercontent.com/tomsercu/lstm/master/data/" .* [TRAINFILE, TESTFILE], + "218f4e6c7288bb5efeb03cc4cb8ae9c04ecd8462ebfba8e13e3549fab69dc25f", + )) +end + +""" + PTBLM(; split=:train, dir=nothing) + PTBLM(split; [dir]) + +The PTBLM dataset consists of Penn Treebank sentences +for language modeling, available from https://github.com/tomsercu/lstm. +The unknown words are replaced with so that the +total vocaburary size becomes 10000. 
+""" +struct PTBLM <: SupervisedDataset + metadata::Dict{String, Any} + split::Symbol + features::Vector{Vector{String}} + targets::Vector{Vector{String}} +end + +PTBLM(; split=:train, dir=nothing) = PTBLM(split; dir) + +function PTBLM(split::Symbol; dir=nothing) + DEPNAME = "PTBLM" + @assert split ∈ [:train, :test] + FILE = split == :train ? "ptb.train.txt" : "ptb.test.txt" + path = datafile(DEPNAME, FILE, dir) + + lines = open(readlines, path) + @assert all(x -> x isa String, lines) + features = map(l -> Vector{String}(Base.split(chomp(l))), lines) + + targets = map(features) do x + y = copy(x) + popfirst!(y) + push!(y, "") + end + + metadata = Dict{String,Any}("n_observations" => length(features)) + return PTBLM(metadata, split, features, targets) +end + + +# DEPRECATED INTERFACE, REMOVE IN v0.7 (or 0.6.x) +function Base.getproperty(::Type{PTBLM}, s::Symbol) + if s === :traindata + @warn "PTBLM.traindata() is deprecated, use `PTBLM(split=:train)[]` instead." maxlog=2 + function traindata(; dir=nothing) + PTBLM(; split=:train, dir)[] + end + return traindata + elseif s === :testdata + @warn "PTBLM.testdata() is deprecated, use `PTBLM(split=:test)[]` instead." 
maxlog=2 + function testdata(; dir=nothing) + PTBLM(; split=:test, dir)[] + end + return testdata + else + return getfield(PTBLM, s) + end +end diff --git a/src/datasets_text/smsspamcollection.jl b/src/datasets_text/smsspamcollection.jl new file mode 100644 index 00000000..7a70c320 --- /dev/null +++ b/src/datasets_text/smsspamcollection.jl @@ -0,0 +1,82 @@ +function __init__smsspam() + + DEPNAME = "SMSSpamCollection" + # LINK = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/" + LINK = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/" + DOCS = "https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection#" + DATA = "smsspamcollection.zip" + + register(DataDep( + DEPNAME, + """ + Dataset: The SMS Spam Collection v.1 + Website: $DOCS + """, + LINK .* [DATA], + "1587ea43e58e82b14ff1f5425c88e17f8496bfcdb67a583dbff9eefaf9963ce3", + post_fetch_method = unpack + )) +end + + +""" + SMSSpamCollection(; dir=nothing) + + +The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS tagged messages +that have been collected for SMS Spam research. It contains one set of SMS messages +in English of 5,574 messages, tagged according to being ham (legitimate) or spam. +The corpus has a total of 4,827 SMS legitimate messages (86.6%) and a total of 747 (13.4%) spam messages. +
+The corpus has been collected by Tiago Agostinho de Almeida (http://www.dt.fee.unicamp.br/~tiago) +and José María Gómez Hidalgo (http://www.esp.uem.es/jmgomez).
+ +```julia-repl +julia> using MLDatasets: SMSSpamCollection + +julia> targets = SMSSpamCollection.targets(); + +julia> summary(targets) +"5574-element Vector{String}" + +julia> targets[1] +"ham" + +julia> summary(features) +``` +""" +struct SMSSpamCollection <: SupervisedDataset + metadata::Dict{String,Any} + features::Vector{String} + targets::Vector{String} +end + +function SMSSpamCollection(; dir = nothing) + DEPNAME = "SMSSpamCollection" + path = datafile(DEPNAME, "SMSSpamCollection", dir) + spam_data = open(readlines, path) + spam_data = [split(str, "\t") for str in spam_data] + @assert all(x -> length(x)==2, spam_data) + targets = [s[1] for s in spam_data] + features = [s[2] for s in spam_data] + + metadata = Dict{String,Any}() + metadata["n_observations"] = length(features) + SMSSpamCollection(metadata, features, targets) +end + + +# DEPRECATED in V0.6 +function Base.getproperty(::Type{SMSSpamCollection}, s::Symbol) + if s == :features + @warn "SMSSpamCollection.features() is deprecated, use `SMSSpamCollection().features` instead." + return () -> SMSSpamCollection().features + elseif s == :targets + @warn "SMSSpamCollection.targets() is deprecated, use `SMSSpamCollection().targets` instead." + return () -> SMSSpamCollection().targets + else + return getfield(SMSSpamCollection, s) + end +end + + diff --git a/src/datasets_text/udenglish.jl b/src/datasets_text/udenglish.jl new file mode 100644 index 00000000..ef0e83fc --- /dev/null +++ b/src/datasets_text/udenglish.jl @@ -0,0 +1,123 @@ +function __init__udenglish() + DEPNAME = "UD_English" + TRAINFILE = "en_ewt-ud-train.conllu" + DEVFILE = "en_ewt-ud-dev.conllu" + TESTFILE = "en_ewt-ud-test.conllu" + + register(DataDep( + DEPNAME, + """ + Dataset: Universal Dependencies - English Dependency Treebank Universal Dependencies English Web Treebank + Authors: Natalia Silveira and Timothy Dozat and + Marie-Catherine de Marneffe and Samuel + Bowman and Miriam Connor and John Bauer and + Christopher D. 
Manning + Website: https://github.com/UniversalDependencies/UD_English-EWT + + + The files are available for download at the github + repository linked above. Note that using the data + responsibly and respecting copyright remains your + responsibility. Copyright and License is discussed in + detail on the Website. + """, + "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/" .* [TRAINFILE, DEVFILE, TESTFILE], + "6df6ee25ab3cd1cde3a09ab075dcc6b8c90d18648eef0809f400be4ad8bc81e2" + )) +end + +""" + UD_English(; split=:train, dir=nothing) + UD_English(split; [dir]) + +A Gold Standard Universal Dependencies Corpus for +English, built over the source material of the +English Web Treebank LDC2012T13 +(https://catalog.ldc.upenn.edu/LDC2012T13). + +The corpus comprises 254,825 words and 16,621 sentences, +taken from five genres of web media: weblogs, newsgroups, emails, reviews, and Yahoo! answers. +See the LDC2012T13 documentation for more details on the sources of the sentences. +The trees were automatically converted into Stanford Dependencies and then hand-corrected to Universal Dependencies. +All the basic dependency annotations have been single-annotated, a limited portion of them have been double-annotated, +and subsequent correction has been done to improve consistency. Other aspects of the treebank, such as Universal POS, +features and enhanced dependencies, have mainly been done automatically, with very limited hand-correction. + + +Authors: Natalia Silveira and Timothy Dozat and + Marie-Catherine de Marneffe and Samuel + Bowman and Miriam Connor and John Bauer and + Christopher D. 
Manning +Website: https://github.com/UniversalDependencies/UD_English-EWT +""" +struct UD_English <: UnsupervisedDataset + metadata::Dict{String,Any} + split::Symbol + features::Vector{Vector{Vector{String}}} +end + +UD_English(; split=:train, dir=nothing) = UD_English(split; dir) + +function UD_English(split::Symbol; dir=nothing) + DEPNAME = "UD_English" + TRAINFILE = "en_ewt-ud-train.conllu" + DEVFILE = "en_ewt-ud-dev.conllu" + TESTFILE = "en_ewt-ud-test.conllu" + + @assert split ∈ [:train, :test, :dev] + + + FILE = split == :train ? TRAINFILE : + split == :test ? TESTFILE : + split === :dev ? DEVFILE : error() + + path = datafile(DEPNAME, FILE, dir) + + doc = [] + sent = [] + lines = open(readlines, path) + for line in lines + line = chomp(line) + if length(line) == 0 + length(sent) > 0 && push!(doc, sent) + sent = [] + elseif line[1] == '#' # comment line + continue + else + items = Vector{String}(Base.split(line,'\t')) + push!(sent, items) + end + end + length(sent) > 0 && push!(doc, sent) + T = typeof(doc[1][1]) + features = Vector{Vector{T}}(doc) + + metadata = Dict{String,Any}("n_observations" => length(features)) + return UD_English(metadata, split, features) +end + + +# DEPRECATED INTERFACE, REMOVE IN v0.7 (or 0.6.x) +function Base.getproperty(::Type{UD_English}, s::Symbol) + if s === :traindata + @warn "UD_English.traindata() is deprecated, use `UD_English(split=:train)[]` instead." maxlog=2 + function traindata(; dir=nothing) + UD_English(; split=:train, dir)[] + end + return traindata + elseif s === :testdata + @warn "UD_English.testdata() is deprecated, use `UD_English(split=:test)[]` instead." maxlog=2 + function testdata(; dir=nothing) + UD_English(; split=:test, dir)[] + end + return testdata + elseif s === :devdata + @warn "UD_English.devdata() is deprecated, use `UD_English(split=:dev)[]` instead." 
maxlog=2 + function devdata(; dir=nothing) + UD_English(; split=:dev, dir)[] + end + return devdata + else + return getfield(UD_English, s) + end +end diff --git a/test/datasets_graph/karateclub.jl b/test/datasets_graph/deprecated_karateclub.jl similarity index 100% rename from test/datasets_graph/karateclub.jl rename to test/datasets_graph/deprecated_karateclub.jl diff --git a/test/datasets_misc/datasets_misc.jl b/test/datasets_misc/misc.jl similarity index 100% rename from test/datasets_misc/datasets_misc.jl rename to test/datasets_misc/misc.jl diff --git a/test/datasets_text/deprecated_smsspamcollection.jl b/test/datasets_text/deprecated_smsspamcollection.jl deleted file mode 100644 index ed7125f4..00000000 --- a/test/datasets_text/deprecated_smsspamcollection.jl +++ /dev/null @@ -1,14 +0,0 @@ -module SMSSpamCollection_Tests -using Test -using MLDatasets - -@testset "SMS Spam Collection" begin - X = SMSSpamCollection.features() - y = SMSSpamCollection.targets() - @test X isa Vector - @test y isa Vector - @test size(X) == (5574,) - @test size(y) == (5574,) -end - -end # module \ No newline at end of file diff --git a/test/datasets_text/deprecated_text.jl b/test/datasets_text/deprecated_text.jl index 477f0ea5..3531e1a0 100644 --- a/test/datasets_text/deprecated_text.jl +++ b/test/datasets_text/deprecated_text.jl @@ -1,14 +1,21 @@ -#temporary to not stress CI -if !parse(Bool, get(ENV, "CI", "false")) - @testset "PTBLM" begin - x, y = PTBLM.traindata() - x, y = PTBLM.testdata() - - end - @testset "UD_English" begin - x = UD_English.traindata() - x = UD_English.devdata() - x = UD_English.testdata() - end -end \ No newline at end of file +@testset "PTBLM" begin + x, y = PTBLM.traindata() + x, y = PTBLM.testdata() + +end + +@testset "UD_English" begin + x = UD_English.traindata() + x = UD_English.devdata() + x = UD_English.testdata() +end + +@testset "SMS Spam Collection" begin + X = SMSSpamCollection.features() + y = SMSSpamCollection.targets() + @test X isa Vector + 
@test y isa Vector + @test size(X) == (5574,) + @test size(y) == (5574,) +end diff --git a/test/datasets_text/text.jl b/test/datasets_text/text.jl new file mode 100644 index 00000000..d53ed4d5 --- /dev/null +++ b/test/datasets_text/text.jl @@ -0,0 +1,40 @@ + +@testset "PTBLM" begin + n_obs = 42068 + n_targets = 1 + n_features = () + Tx=Vector{String} + Ty=Vector{String} + + d = PTBLM() + + test_supervised_array_dataset(d; + n_obs, n_targets, n_features, + Tx, Ty) +end + + +@testset "SMSSpamCollection" begin + n_obs = 5574 + n_targets = 1 + n_features = () + Tx=String + Ty=String + + d = SMSSpamCollection() + + test_supervised_array_dataset(d; + n_obs, n_targets, n_features, + Tx, Ty) +end + +@testset "UD_English" begin + n_features = () + Tx = Vector{Vector{String}} + for (n_obs, split) in [(12543, :train), (2077, :test), (2001, :dev)] + d = UD_English(split) + + test_unsupervised_array_dataset(d; + n_obs, n_features, Tx) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index f0a51d33..28e4ec02 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using Test using MLDatasets -using MLDatasets: SupervisedDataset, AbstractDataset +using MLDatasets: SupervisedDataset, UnsupervisedDataset, AbstractDataset using FileIO using DataDeps using DataFrames, CSV, Tables @@ -18,8 +18,8 @@ include("test_utils.jl") # we comment out deprecated test dataset_tests = [ - #### misc - "datasets_misc/datasets_misc.jl", + ### misc + "datasets_misc/misc.jl", # "datasets_misc/deprecated_misc.jl", #### vision "datasets_vision/emnist.jl", @@ -33,10 +33,10 @@ dataset_tests = [ "datasets_graph/deprecated_pubmed.jl", "datasets_graph/deprecated_tudataset.jl", "datasets_graph/deprecated_polblogs.jl", - "datasets_graph/karateclub.jl", + "datasets_graph/deprecated_karateclub.jl", #### text - "datasets_text/deprecated_text.jl", - "datasets_text/deprecated_smsspamcollection.jl", + "datasets_text/text.jl", + # "datasets_text/deprecated_text.jl", ] no_ci_dataset_tests = [ diff
--git a/test/test_utils.jl b/test/test_utils.jl index 617832ba..0139defd 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -55,10 +55,13 @@ function test_inmemory_supervised_table_dataset(d::D; @test d[] === (; d.features, d.targets) @test length(d) == n_obs + @test numobs(d) == n_obs idx = rand(1:n_obs) @test isequal(d[idx], getobs((; d.features, d.targets), idx)) + @test isequal(d[idx], getobs(d, idx)) idxs = rand(1:n_obs, 2) @test isequal(d[idxs], getobs((; d.features, d.targets), idxs)) + @test isequal(d[idxs], getobs(d, idxs)) end @@ -67,7 +70,12 @@ function test_supervised_array_dataset(d::D; Tx=Any, Ty=Any, conv2img=false) where {D<:SupervisedDataset} - Nx = length(n_features) + 1 + if n_features isa Int + @assert n_features != 0 "use n_features=() if you don't want features dimensions" + Nx = 2 + else # tuple + Nx = length(n_features) + 1 + end Ny = map(x -> x == 1 ? 1 : 2, n_targets) @test d.features isa Array{Tx, Nx} @@ -93,15 +101,56 @@ end @test length(d) == n_obs + @test numobs(d) == n_obs X, y = d[] @test X === d.features @test y === d.targets idx = rand(1:n_obs) @test isequal(d[idx], getobs((; d.features, d.targets), idx)) + @test isequal(d[idx], getobs(d, idx)) idxs = rand(1:n_obs, 2) @test isequal(d[idxs], getobs((; d.features, d.targets), idxs)) + @test isequal(d[idxs], getobs(d, idxs)) + + if conv2img + img = convert2image(d, 1) + @test img isa AbstractArray{<:Color} + x = d[1].features + @test convert2image(D, x) == img + @test convert2image(d, x) == img + end +end + +function test_unsupervised_array_dataset(d::D; + n_obs, n_features, + Tx=Any, + conv2img=false) where {D<:UnsupervisedDataset} + + n_features = n_features === nothing ? 
() : n_features + if n_features isa Int + @assert n_features != 0 "use n_features = () if you don't want features dimensions" + Nx = 2 + else # tuple + Nx = length(n_features) + 1 + end + + @test d.features isa Array{Tx, Nx} + @test size(d.features) == (n_features..., n_obs) + + @test length(d) == n_obs + @test numobs(d) == n_obs + X = d[] + @test X === d.features + + idx = rand(1:n_obs) + @test isequal(d[idx], getobs(d.features, idx)) + @test isequal(d[idx], getobs(d, idx)) + idxs = rand(1:n_obs, 2) + @test isequal(d[idxs], getobs(d.features, idxs)) + @test isequal(d[idxs], getobs(d, idxs)) + if conv2img img = convert2image(d, 1) @test img isa AbstractArray{<:Color}