diff --git a/README.md b/README.md index b4ff181..2945c2b 100644 --- a/README.md +++ b/README.md @@ -40,3 +40,4 @@ Follow the links below for full docs on the usage of the corpora. - [IMDB movie reviews](docs/src/IMDB.md) - [Twitter sentiment dataset](docs/src/Twitter.md) - [Stanford Sentiment Treebank](docs/src/SST.md) + - [Groningen Meaning Bank (GMB)](docs/src/GMB.md) diff --git a/docs/make.jl b/docs/make.jl index ed95e17..4e42ba3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,7 +17,8 @@ makedocs(modules = [CorpusLoaders], "Twitter" => "Twitter.md", "WikiCorpus" => "WikiCorpus.md", "WikiGold" => "WikiGold.md", - "API References" => "APIReference.md" + "API References" => "APIReference.md", + "GMB" => "GMB.md" ]) diff --git a/docs/src/GMB.md b/docs/src/GMB.md new file mode 100644 index 0000000..caff1b8 --- /dev/null +++ b/docs/src/GMB.md @@ -0,0 +1,69 @@ +# GMB +The dataset is an extract from the GMB corpus which is tagged, annotated, +and built specifically to train classifiers to predict named entities such as names, locations, etc. + +GMB is a fairly large corpus with a lot of annotations. +Unfortunately, GMB is not perfect. It is not a gold-standard corpus, meaning that it is not completely human-annotated and is not considered 100% correct. +The corpus was created using existing annotators and then corrected by humans where needed. + +The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations. +The GMB is developed at the [University of Groningen](https://www.rug.nl/). + A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation. + +The part-of-speech tagset used is the Penn Treebank tagset, as listed in Ann Taylor, Mitchell Marcus and Beatrice Santorini (2003): [The Penn Treebank: An Overview](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.9.8216), Section 1.1. 
+ + + +For more details, refer to the [GMB project page](https://gmb.let.rug.nl/about.php). + +```julia + +julia> Data = load(GMB()) +37789-element Array{Array{PosTaggedWord,1},1}: + [PosTaggedWord("NNS", "Families"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "soldiers"), PosTaggedWord("VBN", "killed"), PosTaggedWord("IN", "in"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "conflict"), PosTaggedWord("VBD", "joined"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "protesters") … PosTaggedWord("CD", "One"), PosTaggedWord("NN", "Terrorist"), PosTaggedWord("RQU", "\""), PosTaggedWord("CC", "and"), PosTaggedWord("LQU", "\""), PosTaggedWord("VB", "Stop"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Bombings"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")] + + [PosTaggedWord("PRP", "They"), PosTaggedWord("VBD", "marched"), PosTaggedWord("IN", "from"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Houses"), PosTaggedWord("IN", "of"), PosTaggedWord("NN", "Parliament"), PosTaggedWord("TO", "to"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "rally"), PosTaggedWord("IN", "in"), PosTaggedWord("NNP", "Hyde"), PosTaggedWord("NNP", "Park"), PosTaggedWord(".", ".")] + + [PosTaggedWord("NNS", "Police"), PosTaggedWord("VBD", "put"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "number"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "marchers"), PosTaggedWord("IN", "at"), PosTaggedWord("CD", "10,000"), PosTaggedWord("IN", "while"), PosTaggedWord("NNS", "organizers"), PosTaggedWord("VBD", "claimed"), PosTaggedWord("PRP", "it"), PosTaggedWord("VBD", "was"), PosTaggedWord("CD", "100,000"), PosTaggedWord(".", ".")] + + ⋮ + + [PosTaggedWord("IN", "At"), PosTaggedWord("JJ", "last"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goatherd"), PosTaggedWord("VBD", "threw"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "stone"), PosTaggedWord(",", ","), PosTaggedWord("CC", "and"), PosTaggedWord("VBG", "breaking") … PosTaggedWord(",", ","), PosTaggedWord("VBD", "begged"), 
PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("RB", "not"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "tell"), PosTaggedWord("PRP\$", "his"), PosTaggedWord("NN", "master"), PosTaggedWord(".", ".")] + + [PosTaggedWord("DT", "The"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("VBD", "replied"), PosTaggedWord(",", ","), PosTaggedWord("LQU", "\""), PosTaggedWord("WRB", "Why"), PosTaggedWord(",", ","), PosTaggedWord("PRP", "you"), PosTaggedWord("JJ", "silly"), PosTaggedWord("NN", "fellow") … PosTaggedWord("DT", "the"), PosTaggedWord("NN", "horn"), PosTaggedWord("MD", "will"), PosTaggedWord("VB", "speak"), PosTaggedWord("IN", "though"), PosTaggedWord("PRP", "I"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "silent"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")] + + [PosTaggedWord("VBP", "Do"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "attempt"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "hide"), PosTaggedWord("NNS", "things"), PosTaggedWord("WDT", "which"), PosTaggedWord("MD", "can"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "hid"), PosTaggedWord(".", ".")] + +julia> Data[1] +30-element Array{PosTaggedWord,1}: + PosTaggedWord("NNS", "Families") + PosTaggedWord("IN", "of") + PosTaggedWord("NNS", "soldiers") + PosTaggedWord("VBN", "killed") + PosTaggedWord("IN", "in") + PosTaggedWord("DT", "the") + PosTaggedWord("NN", "conflict") + PosTaggedWord("VBD", "joined") + PosTaggedWord("DT", "the") + PosTaggedWord("NNS", "protesters") + PosTaggedWord("WP", "who") + PosTaggedWord("VBD", "carried") + PosTaggedWord("NNS", "banners") + PosTaggedWord("IN", "with") + ⋮ + PosTaggedWord("IN", "as") + PosTaggedWord("LQU", "\"") + PosTaggedWord("NNP", "Bush") + PosTaggedWord("NN", "Number") + PosTaggedWord("CD", "One") + PosTaggedWord("NN", "Terrorist") + PosTaggedWord("RQU", "\"") + PosTaggedWord("CC", "and") + PosTaggedWord("LQU", "\"") + PosTaggedWord("VB", "Stop") + PosTaggedWord("DT", "the") + 
PosTaggedWord("NNS", "Bombings")
+ PosTaggedWord(".", ".")
+ PosTaggedWord("LQU", "\"")
+
+```
diff --git a/src/CorpusLoaders.jl b/src/CorpusLoaders.jl
index 07cf516..31c1a6c 100644
--- a/src/CorpusLoaders.jl
+++ b/src/CorpusLoaders.jl
@@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord
 export title, sensekey, word, named_entity, part_of_speech
 
 export load
-export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000
+export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB
 
 function __init__()
     include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
@@ -24,6 +24,7 @@ function __init__()
     include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
     include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
     include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl"))
+    include(joinpath(@__DIR__, "GMB_DataDeps.jl"))
 end
 
 include("types.jl")
@@ -38,5 +39,5 @@ include("Twitter.jl")
 include("StanfordSentimentTreebank.jl")
 include("WikiGold.jl")
 include("CoNLL2000.jl")
-
+include("GMB.jl")
 end
diff --git a/src/GMB.jl b/src/GMB.jl
new file mode 100644
index 0000000..c3faef2
--- /dev/null
+++ b/src/GMB.jl
@@ -0,0 +1,84 @@
+"""
+    GMB(filepath::Vector)
+    GMB(dirpath)
+    GMB()
+
+Corpus handle for the `en.tags` files of the Groningen Meaning Bank (GMB).
+
+`filepath` holds the paths of the tag files read by `load`. `GMB(dirpath)` collects
+every file matching `data/*/*/en.tags` below `dirpath`; `GMB()` uses the directory of
+the `"GMB 2.2.0"` data dependency.
+"""
+struct GMB{S}
+    filepath::Vector{S}
+end
+
+function GMB(dirpath)
+    # `@assert` can be disabled, so the argument is validated explicitly.
+    isdir(dirpath) || throw(ArgumentError("not a directory: $dirpath"))
+    return GMB(glob("data/*/*/en.tags", dirpath))
+end
+
+GMB() = GMB(datadep"GMB 2.2.0")
+
+# `load` returns sentences of words with all files concatenated, so the outermost
+# level of the loaded data is the sentence.
+MultiResolutionIterators.levelname_map(::Type{<:GMB}) = [
+    :sent=>1, :sentence=>1,
+    :word=>2, :token=>2,
+    :char=>3, :character=>3
+]
+
+"""
+    parse_gmb_tagged_word(line::AbstractString)
+
+Build a `PosTaggedWord` from one tab-separated `line`: the first column is the word,
+the second column is its tag, and any further columns are ignored.
+"""
+function parse_gmb_tagged_word(line::AbstractString)
+    # Only the first two columns are used, so stop splitting after them.
+    columns = split(line, '\t'; limit=3)
+    if length(columns) < 2
+        throw(ArgumentError("expected two tab-separated columns, got $(repr(line))"))
+    end
+    return PosTaggedWord(columns[2], columns[1])
+end
+
+"""
+    parse_gmb(filename)
+
+Read `filename` line by line and return its sentences, each a vector of
+`PosTaggedWord`s. Empty lines separate sentences; every other line is one token.
+"""
+function parse_gmb(filename)
+    sents = @NestedVector(PosTaggedWord, 2)()
+    sent = @NestedVector(PosTaggedWord, 1)()
+
+    for line in eachline(filename)
+        if !isempty(line)
+            push!(sent, parse_gmb_tagged_word(line))
+        elseif !isempty(sent)
+            # An empty line closes the current sentence; repeated empty lines
+            # must not produce empty sentences.
+            push!(sents, sent)
+            sent = @NestedVector(PosTaggedWord, 1)()
+        end
+    end
+    # Keep the last sentence when the file does not end with an empty line.
+    isempty(sent) || push!(sents, sent)
+    return sents
+end
+
+"""
+    load(corpus::GMB)
+
+Parse every file in `corpus.filepath` and return all sentences in a single vector,
+each sentence being a vector of `PosTaggedWord`s.
+"""
+function load(corpus::GMB)
+    sentences = @NestedVector(PosTaggedWord, 2)()
+    for path in corpus.filepath
+        append!(sentences, parse_gmb(path))
+    end
+    return sentences
+end
diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
new file mode 100644
index 0000000..44228ac
--- /dev/null
+++ b/src/GMB_DataDeps.jl
@@ -0,0 +1,36 @@
+using DataDeps
+
+# One DataDep is registered per GMB release, from (version, checksum) pairs.
+for (ver, checksum) in [
+    ("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"),
+    ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"),
+    ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"),
+    ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"),
+    ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625"),
+]
+    register(DataDep("GMB $ver",
+        """
+        Website: https://gmb.let.rug.nl/data.php
+        Original Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes
+
+        The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+        The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank.
+        A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+        Please cite the following publication if you use the corpora:
+        Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496."
+        """,
+        "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
+        checksum;
+        post_fetch_method = function (fn)
+            unpack(fn)
+            # Assumes the archive unpacks into a top-level `gmb-<ver>` directory,
+            # matching the zip file name -- TODO confirm against the release archives.
+            innerdir = "gmb-$(ver)"
+            innerfiles = readdir(innerdir)
+            # Move everything to current directory, under same name
+            mv.(joinpath.(innerdir, innerfiles), innerfiles)
+            rm(innerdir)
+        end
+    ))
+end
diff --git a/test/test_GMB.jl b/test/test_GMB.jl
new file mode 100644
index 0000000..4e54bed
--- /dev/null
+++ b/test/test_GMB.jl
@@ -0,0 +1,25 @@
+using CorpusLoaders
+using Test
+using Base.Iterators
+using MultiResolutionIterators
+using DataDeps
+
+# Every registered GMB release is loaded from its own data dependency directory.
+@testset "Using flatten_levels" for path in [
+    datadep"GMB 1.0.0", datadep"GMB 1.1.0", datadep"GMB 2.0.0",
+    datadep"GMB 2.1.0", datadep"GMB 2.2.0",
+]
+    train = load(GMB(path))
+    sents = train[1:5]
+
+    words = full_consolidate(flatten_levels(sents, (!lvls)(GMB, :word)))
+    @test length(words) > length(sents)
+    @test typeof(words) == Vector{PosTaggedWord}
+
+    plain_words = word.(words)
+    @test typeof(plain_words) <: Vector{String}
+
+    # NOTE(review): assumes `part_of_speech` has a method for `PosTaggedWord` -- confirm.
+    pos_tags = part_of_speech.(words)
+    @test typeof(pos_tags) <: Vector{String}
+end