From 3c86c51e52f2a7ec176bad4980ba284dae8293b3 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 8 Mar 2020 08:39:58 +0530 Subject: [PATCH 01/16] adding GMB.jl --- src/GMB.jl | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/GMB.jl diff --git a/src/GMB.jl b/src/GMB.jl new file mode 100644 index 0000000..e543ce6 --- /dev/null +++ b/src/GMB.jl @@ -0,0 +1,71 @@ +struct GMB{S} + filepath :: Vector{S} +end + +function GMB(dirpath) + @assert(isdir(dirpath), dirpath) + paths=String[] + data_path = joinpath.(dirpath,"data") + for dir in readdir(data_path) + for d in readdir(joinpath.(data_path,dir)) + if ("en.tags" in readdir(joinpath.(data_path,dir,d))) == true + push!(paths,joinpath.(data_path,dir,d,"en.tags")) + end + + end + end + +GMB(paths) + +end + +GMB() = GMB(datadep"GMB 2.2.0") + +MultiResolutionIterators.levelname_map(::Type{GMB}) = [ + :doc=>1, :contextfile=>1, :context=>1, :document=>1, + :para=>2, :paragraph=>2, + :sent=>3, :sentence=>3, + :word=>4, :token=>4, + :char=>5, :character=>5 + ] + +function parse_gmb_tagged_word(line::AbstractString) + tokens_tags = split(line," ") + return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1]) +end + +function parse_gmb(filename) + local sent=[] + + sents = @NestedVector(NerOnlyTaggedWord, 2)() + context = Document(intern(basename(filename)), sents) + + function new_sentence() + sent = @NestedVector(NerOnlyTaggedWord,1)() + push!(sents, sent) + end + + + # words + get_tagged(line) = push!(sent, parse_gmb_tagged_word(line)) + + # parse + +for line in eachline(filename) + if length(line) == 0 + new_sentence() + else + get_tagged(line) + end + end + return context +end + +function load(corpus::GMB) + ch=[] + for fn in corpus.filepath + document = parse_gmb(fn) + append!(ch, document) + end + return(ch) +end From 18b393914fa7fb2621b56fa110264aa2fb8355e5 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 8 Mar 2020 08:41:10 +0530 Subject: [PATCH 
02/16] adding GMB_DataDeps --- src/GMB_DataDeps.jl | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 src/GMB_DataDeps.jl diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl new file mode 100644 index 0000000..9136a84 --- /dev/null +++ b/src/GMB_DataDeps.jl @@ -0,0 +1,33 @@ +using DataDeps + + +for (ver, checksum) in [("1.0.0", "16814254fe194d55a2fcc24858aa76d71de3c49e495bd98478cc7345e766d8b7"), + ("1.1.0", "0495577ac3a87c2a64fe6189798ea046de0f44943dfb7b60fe38cf648d34c421"), + ("2.0.0", "70b9eb7ca0dc9d67655f9d671d40be10aeff490f0bea4f10cb1946127b74c102"), + ("2.1.0", "93fbae725f0125dedb7369403fda1dace85b2dcd8a523ed80af23e863b18ef2c"), + ("2.2.0", "0714f07dbcb84a215d668f3ee85892fa8fa4a8154439662eb7529413367b8f56")] + + register(DataDep("GMB $ver", + """ + Website: https://gmb.let.rug.nl/data.php + Orignal Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes + + The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations. + The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank. + A thorough description of the GMB can be found in the Handbook of Linguistic Annotation. + + Please cite the following publication if you use the corpora: + Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496." 
+ """, + "https://gmb.let.rug.nl/releases/gmb-$(ver).zip", + checksum; + post_fetch_method = fn -> begin + unpack(fn) + innerdir = "gbm-$(ver)" + innerfiles = readdir(innerdir) + # Move everything to current directory, under same name + mv.(joinpath.(innerdir, innerfiles), innerfiles) + rm(innerdir) + end + )) +end From 45f0867f73514de2b1f2dafbcf7f533e8babd21c Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 8 Mar 2020 09:46:41 +0530 Subject: [PATCH 03/16] checksumupdate --- src/GMB_DataDeps.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl index 9136a84..62e4101 100644 --- a/src/GMB_DataDeps.jl +++ b/src/GMB_DataDeps.jl @@ -1,11 +1,11 @@ using DataDeps -for (ver, checksum) in [("1.0.0", "16814254fe194d55a2fcc24858aa76d71de3c49e495bd98478cc7345e766d8b7"), - ("1.1.0", "0495577ac3a87c2a64fe6189798ea046de0f44943dfb7b60fe38cf648d34c421"), - ("2.0.0", "70b9eb7ca0dc9d67655f9d671d40be10aeff490f0bea4f10cb1946127b74c102"), - ("2.1.0", "93fbae725f0125dedb7369403fda1dace85b2dcd8a523ed80af23e863b18ef2c"), - ("2.2.0", "0714f07dbcb84a215d668f3ee85892fa8fa4a8154439662eb7529413367b8f56")] +for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"), + ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"), + ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"), + ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"), + ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625")] register(DataDep("GMB $ver", """ From c21c4dd551765bbf7567fc0382c8942f8bc180b3 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 8 Mar 2020 10:16:21 +0530 Subject: [PATCH 04/16] updating module --- src/CorpusLoaders.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/CorpusLoaders.jl b/src/CorpusLoaders.jl index 07cf516..31c1a6c 100644 --- a/src/CorpusLoaders.jl 
+++ b/src/CorpusLoaders.jl @@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord export title, sensekey, word, named_entity, part_of_speech export load -export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000 +export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB function __init__() include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl")) @@ -24,6 +24,7 @@ function __init__() include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl")) include(joinpath(@__DIR__, "WikiGold_DataDeps.jl")) include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl")) + include(joinpath(@__DIR__, "GMB_DataDeps.jl")) end include("types.jl") @@ -38,5 +39,5 @@ include("Twitter.jl") include("StanfordSentimentTreebank.jl") include("WikiGold.jl") include("CoNLL2000.jl") - +include("GMB.jl") end From 65b49f192efe970a5f3fdcc3cda6ff9f87b9525a Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Wed, 11 Mar 2020 20:52:44 +0530 Subject: [PATCH 05/16] Update src/GMB.jl Co-Authored-By: Lyndon White --- src/GMB.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GMB.jl b/src/GMB.jl index e543ce6..a2f9636 100644 --- a/src/GMB.jl +++ b/src/GMB.jl @@ -8,7 +8,7 @@ function GMB(dirpath) data_path = joinpath.(dirpath,"data") for dir in readdir(data_path) for d in readdir(joinpath.(data_path,dir)) - if ("en.tags" in readdir(joinpath.(data_path,dir,d))) == true + if ispath(joinpath(data_path, dir, d, "en.tags"))) push!(paths,joinpath.(data_path,dir,d,"en.tags")) end From d78f5be90681afd46485499b357e840283579e81 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Wed, 11 Mar 2020 20:53:57 +0530 Subject: [PATCH 06/16] Update src/GMB.jl Co-Authored-By: Lyndon White --- src/GMB.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GMB.jl 
b/src/GMB.jl index a2f9636..c79a485 100644 --- a/src/GMB.jl +++ b/src/GMB.jl @@ -41,7 +41,7 @@ function parse_gmb(filename) context = Document(intern(basename(filename)), sents) function new_sentence() - sent = @NestedVector(NerOnlyTaggedWord,1)() + sent = @NestedVector(NerOnlyTaggedWord, 1)() push!(sents, sent) end From 0ebe1e85ce6501cd70f7b46f2af31f0900563bb7 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Wed, 11 Mar 2020 21:08:41 +0530 Subject: [PATCH 07/16] Update src/GMB.jl Co-Authored-By: Lyndon White --- src/GMB.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GMB.jl b/src/GMB.jl index c79a485..58f5ab5 100644 --- a/src/GMB.jl +++ b/src/GMB.jl @@ -30,7 +30,7 @@ MultiResolutionIterators.levelname_map(::Type{GMB}) = [ ] function parse_gmb_tagged_word(line::AbstractString) - tokens_tags = split(line," ") + tokens_tags = split(line, '\t') return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1]) end From 35fda3bd60c71e9c12e76aa67aeeb7e3c190d060 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Fri, 13 Mar 2020 19:33:47 +0530 Subject: [PATCH 08/16] Update src/GMB_DataDeps.jl Co-Authored-By: Lyndon White --- src/GMB_DataDeps.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl index 62e4101..44228ac 100644 --- a/src/GMB_DataDeps.jl +++ b/src/GMB_DataDeps.jl @@ -21,7 +21,7 @@ for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f224259 """, "https://gmb.let.rug.nl/releases/gmb-$(ver).zip", checksum; - post_fetch_method = fn -> begin + post_fetch_method = function (fn) unpack(fn) innerdir = "gbm-$(ver)" innerfiles = readdir(innerdir) From 2bb03e8c64380d8f76d8a48eb2275f123728706e Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 14 Mar 2020 05:18:23 +0530 Subject: [PATCH 09/16] indentation correction with use of glob --- src/GMB.jl | 40 
+++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/GMB.jl b/src/GMB.jl index 58f5ab5..89707a4 100644 --- a/src/GMB.jl +++ b/src/GMB.jl @@ -4,19 +4,8 @@ end function GMB(dirpath) @assert(isdir(dirpath), dirpath) - paths=String[] - data_path = joinpath.(dirpath,"data") - for dir in readdir(data_path) - for d in readdir(joinpath.(data_path,dir)) - if ispath(joinpath(data_path, dir, d, "en.tags"))) - push!(paths,joinpath.(data_path,dir,d,"en.tags")) - end - - end - end - -GMB(paths) - + paths=glob("data/*/*/en.tags",dirpath) + GMB(paths) end GMB() = GMB(datadep"GMB 2.2.0") @@ -35,10 +24,8 @@ function parse_gmb_tagged_word(line::AbstractString) end function parse_gmb(filename) - local sent=[] - - sents = @NestedVector(NerOnlyTaggedWord, 2)() - context = Document(intern(basename(filename)), sents) + local sent=[] + sents = @NestedVector(NerOnlyTaggedWord, 2)() function new_sentence() sent = @NestedVector(NerOnlyTaggedWord, 1)() @@ -50,22 +37,21 @@ function parse_gmb(filename) get_tagged(line) = push!(sent, parse_gmb_tagged_word(line)) # parse - -for line in eachline(filename) + for line in eachline(filename) if length(line) == 0 new_sentence() else get_tagged(line) end - end - return context + end + return sents end function load(corpus::GMB) - ch=[] - for fn in corpus.filepath - document = parse_gmb(fn) - append!(ch, document) - end - return(ch) + ch=[] + for fn in corpus.filepath + document = parse_gmb(fn) + append!(ch, document) + end + return(ch) end From 9fa1deb3ec05dd7aab610a8a27a4702c2bca5306 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 14 Mar 2020 05:53:35 +0530 Subject: [PATCH 10/16] updated_docs with GMB --- docs/src/GMB.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/src/GMB.md diff --git a/docs/src/GMB.md b/docs/src/GMB.md new file mode 100644 index 0000000..d02c33f --- /dev/null +++ b/docs/src/GMB.md @@ -0,0 +1,58 @@ +# GMB 
+The dataset is an extract from the GMB corpus, which is tagged, annotated, +and built specifically to train the classifier to predict named entities such as name, location, etc. + +GMB is a fairly large corpus with a lot of annotations. +Unfortunately, GMB is not perfect. It is not a gold standard corpus, meaning that it’s not completely human annotated and it’s not considered 100% correct. +The corpus is created by using already existed annotators and then corrected by humans where needed. + + +```julia + +julia> corp = load(GMB()) +37789-element Array{Any,1}: + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Families"), NerOnlyTaggedWord("IN", "of"), +NerOnlyTaggedWord("NNS", "soldiers"), NerOnlyTaggedWord("VBN", "killed"), NerOnlyTaggedWord("IN", "in"), +NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "conflict"), NerOnlyTaggedWord("VBD", "joined"), +NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "protesters") … NerOnlyTaggedWord("CD", "One"), +NerOnlyTaggedWord("NN", "Terrorist"), NerOnlyTaggedWord("RQU", "\""), NerOnlyTaggedWord("CC", "and"), +NerOnlyTaggedWord("LQU", "\""), NerOnlyTaggedWord("VB", "Stop"), NerOnlyTaggedWord("DT", "the"), +NerOnlyTaggedWord("NNS", "Bombings"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")] + + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("PRP", "They"), NerOnlyTaggedWord("VBD", "marched"), +NerOnlyTaggedWord("IN", "from"), NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "Houses"), +NerOnlyTaggedWord("IN", "of"), NerOnlyTaggedWord("NN", "Parliament"), NerOnlyTaggedWord("TO", "to"), +NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "rally"), NerOnlyTaggedWord("IN", "in"), +NerOnlyTaggedWord("NNP", "Hyde"), NerOnlyTaggedWord("NNP", "Park"), NerOnlyTaggedWord(".", ".")] + + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Police"), NerOnlyTaggedWord("VBD", "put"), +NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "number"), NerOnlyTaggedWord("IN", "of"), 
+NerOnlyTaggedWord("NNS", "marchers"), NerOnlyTaggedWord("IN", "at"), NerOnlyTaggedWord("CD", "10,000"), +NerOnlyTaggedWord("IN", "while"), NerOnlyTaggedWord("NNS", "organizers"), NerOnlyTaggedWord("VBD", "claimed"), +NerOnlyTaggedWord("PRP", "it"), NerOnlyTaggedWord("VBD", "was"), NerOnlyTaggedWord("CD", "100,000"), +NerOnlyTaggedWord(".", ".")] + + ⋮ + + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("IN", "At"), NerOnlyTaggedWord("JJ", "last"), +NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goatherd"), NerOnlyTaggedWord("VBD", "threw"), +NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "stone"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("CC", +"and"), NerOnlyTaggedWord("VBG", "breaking") … NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("VBD", "begged"), +NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goat"), NerOnlyTaggedWord("RB", "not"), +NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "tell"), NerOnlyTaggedWord("PRP\$", "his"), +NerOnlyTaggedWord("NN", "master"), NerOnlyTaggedWord(".", ".")] + + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("DT", "The"), NerOnlyTaggedWord("NNP", "Goat"), + NerOnlyTaggedWord("VBD", "replied"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("LQU", "\""), + NerOnlyTaggedWord("WRB", "Why"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("PRP", "you"), +NerOnlyTaggedWord("JJ", "silly"), NerOnlyTaggedWord("NN", "fellow") … NerOnlyTaggedWord("DT", "the"), +NerOnlyTaggedWord("NN", "horn"), NerOnlyTaggedWord("MD", "will"), NerOnlyTaggedWord("VB", "speak"), +NerOnlyTaggedWord("IN", "though"), NerOnlyTaggedWord("PRP", "I"), NerOnlyTaggedWord("VB", "be"), +NerOnlyTaggedWord("JJ", "silent"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")] + + CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("VBP", "Do"), NerOnlyTaggedWord("RB", "not"), +NerOnlyTaggedWord("VB", "attempt"), NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "hide"), +NerOnlyTaggedWord("NNS", "things"), 
NerOnlyTaggedWord("WDT", "which"), NerOnlyTaggedWord("MD", "can"), +NerOnlyTaggedWord("RB", "not"), NerOnlyTaggedWord("VB", "be"), NerOnlyTaggedWord("JJ", "hid"), NerOnlyTaggedWord(".", ".")] + +``` From 64896b0253b73ebcd810995ed9c4dc0cb021e7f3 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 14 Mar 2020 05:54:21 +0530 Subject: [PATCH 11/16] updated make.jl --- docs/make.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index ed95e17..4e42ba3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,7 +17,8 @@ makedocs(modules = [CorpusLoaders], "Twitter" => "Twitter.md", "WikiCorpus" => "WikiCorpus.md", "WikiGold" => "WikiGold.md", - "API References" => "APIReference.md" + "API References" => "APIReference.md", + "GMB" => "GMB.md" ]) From 535b8f6dd46ec37a426caabee187f4222e92f9b1 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 14 Mar 2020 06:06:21 +0530 Subject: [PATCH 12/16] updating Read me --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b4ff181..2945c2b 100644 --- a/README.md +++ b/README.md @@ -40,3 +40,4 @@ Follow the links below for full docs on the usage of the corpora. 
- [IMDB movie reviews](docs/src/IMDB.md) - [Twitter sentiment dataset](docs/src/Twitter.md) - [Stanford Sentiment Treebank](docs/src/SST.md) + - [GMB](docs/src/GMB.md) From b322cf4538e81ffc1b55e9d7d36fd451308b9feb Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 14 Mar 2020 07:57:38 +0530 Subject: [PATCH 13/16] adding tests --- test/test_GMB.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 test/test_GMB.jl diff --git a/test/test_GMB.jl b/test/test_GMB.jl new file mode 100644 index 0000000..4e54bed --- /dev/null +++ b/test/test_GMB.jl @@ -0,0 +1,21 @@ +using CorpusLoaders +using Test +using Base.Iterators +using MultiResolutionIterators +using DataDeps + +@testset "Using flatten_levels" for path in [datadep"GMB 1.0.0", datadep"GMB 1.1.0", datadep"GMB 2.0.0", datadep"GMB 2.1.0", datadep"GMB 2.2.0"] + train = load(GMB()) + docs = train[1:5] + + words = full_consolidate(flatten_levels(docs, (!lvls)(CoNLL, :word))) + @test length(words) > length(docs) + @test typeof(words) == Vector{CorpusLoaders.NerOnlyTaggedWord} + + plain_words = word.(words) + @test typeof(plain_words) <: Vector{String} + + ner_tags = named_entity.(words) + @test typeof(ner_tags) <: Vector{String} + +end From c96c3710ecdb2250098e5e1c3af831a81407dc4f Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 15 Mar 2020 00:39:40 +0530 Subject: [PATCH 14/16] updating docs --- docs/src/GMB.md | 91 ++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/docs/src/GMB.md b/docs/src/GMB.md index d02c33f..0e7c68a 100644 --- a/docs/src/GMB.md +++ b/docs/src/GMB.md @@ -6,53 +6,60 @@ GMB is a fairly large corpus with a lot of annotations. Unfortunately, GMB is not perfect. It is not a gold standard corpus, meaning that it’s not completely human annotated and it’s not considered 100% correct. The corpus is created by using already existed annotators and then corrected by humans where needed. 
+The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations. +The GMB is developed at the [University of Groningen](https://www.rug.nl/). + A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation. + +For more detail [refer](https://gmb.let.rug.nl/about.php) ```julia -julia> corp = load(GMB()) -37789-element Array{Any,1}: - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Families"), NerOnlyTaggedWord("IN", "of"), -NerOnlyTaggedWord("NNS", "soldiers"), NerOnlyTaggedWord("VBN", "killed"), NerOnlyTaggedWord("IN", "in"), -NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "conflict"), NerOnlyTaggedWord("VBD", "joined"), -NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "protesters") … NerOnlyTaggedWord("CD", "One"), -NerOnlyTaggedWord("NN", "Terrorist"), NerOnlyTaggedWord("RQU", "\""), NerOnlyTaggedWord("CC", "and"), -NerOnlyTaggedWord("LQU", "\""), NerOnlyTaggedWord("VB", "Stop"), NerOnlyTaggedWord("DT", "the"), -NerOnlyTaggedWord("NNS", "Bombings"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")] - - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("PRP", "They"), NerOnlyTaggedWord("VBD", "marched"), -NerOnlyTaggedWord("IN", "from"), NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "Houses"), -NerOnlyTaggedWord("IN", "of"), NerOnlyTaggedWord("NN", "Parliament"), NerOnlyTaggedWord("TO", "to"), -NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "rally"), NerOnlyTaggedWord("IN", "in"), -NerOnlyTaggedWord("NNP", "Hyde"), NerOnlyTaggedWord("NNP", "Park"), NerOnlyTaggedWord(".", ".")] - - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Police"), NerOnlyTaggedWord("VBD", "put"), -NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "number"), NerOnlyTaggedWord("IN", "of"), -NerOnlyTaggedWord("NNS", "marchers"), 
NerOnlyTaggedWord("IN", "at"), NerOnlyTaggedWord("CD", "10,000"), -NerOnlyTaggedWord("IN", "while"), NerOnlyTaggedWord("NNS", "organizers"), NerOnlyTaggedWord("VBD", "claimed"), -NerOnlyTaggedWord("PRP", "it"), NerOnlyTaggedWord("VBD", "was"), NerOnlyTaggedWord("CD", "100,000"), -NerOnlyTaggedWord(".", ".")] + Data= load(GMB()) +37789-element Array{Array{PosTaggedWord,1},1}: + [PosTaggedWord("NNS", "Families"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "soldiers"), PosTaggedWord("VBN", "killed"), PosTaggedWord("IN", "in"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "conflict"), PosTaggedWord("VBD", "joined"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "protesters") … PosTaggedWord("CD", "One"), PosTaggedWord("NN", "Terrorist"), PosTaggedWord("RQU", "\""), PosTaggedWord("CC", "and"), PosTaggedWord("LQU", "\""), PosTaggedWord("VB", "Stop"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Bombings"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")] + + [PosTaggedWord("PRP", "They"), PosTaggedWord("VBD", "marched"), PosTaggedWord("IN", "from"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Houses"), PosTaggedWord("IN", "of"), PosTaggedWord("NN", "Parliament"), PosTaggedWord("TO", "to"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "rally"), PosTaggedWord("IN", "in"), PosTaggedWord("NNP", "Hyde"), PosTaggedWord("NNP", "Park"), PosTaggedWord(".", ".")] + + [PosTaggedWord("NNS", "Police"), PosTaggedWord("VBD", "put"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "number"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "marchers"), PosTaggedWord("IN", "at"), PosTaggedWord("CD", "10,000"), PosTaggedWord("IN", "while"), PosTaggedWord("NNS", "organizers"), PosTaggedWord("VBD", "claimed"), PosTaggedWord("PRP", "it"), PosTaggedWord("VBD", "was"), PosTaggedWord("CD", "100,000"), PosTaggedWord(".", ".")] ⋮ + + [PosTaggedWord("IN", "At"), PosTaggedWord("JJ", "last"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goatherd"), PosTaggedWord("VBD", 
"threw"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "stone"), PosTaggedWord(",", ","), PosTaggedWord("CC", "and"), PosTaggedWord("VBG", "breaking") … PosTaggedWord(",", ","), PosTaggedWord("VBD", "begged"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("RB", "not"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "tell"), PosTaggedWord("PRP\$", "his"), PosTaggedWord("NN", "master"), PosTaggedWord(".", ".")] + + [PosTaggedWord("DT", "The"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("VBD", "replied"), PosTaggedWord(",", ","), PosTaggedWord("LQU", "\""), PosTaggedWord("WRB", "Why"), PosTaggedWord(",", ","), PosTaggedWord("PRP", "you"), PosTaggedWord("JJ", "silly"), PosTaggedWord("NN", "fellow") … PosTaggedWord("DT", "the"), PosTaggedWord("NN", "horn"), PosTaggedWord("MD", "will"), PosTaggedWord("VB", "speak"), PosTaggedWord("IN", "though"), PosTaggedWord("PRP", "I"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "silent"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")] + + [PosTaggedWord("VBP", "Do"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "attempt"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "hide"), PosTaggedWord("NNS", "things"), PosTaggedWord("WDT", "which"), PosTaggedWord("MD", "can"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "hid"), PosTaggedWord(".", ".")] - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("IN", "At"), NerOnlyTaggedWord("JJ", "last"), -NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goatherd"), NerOnlyTaggedWord("VBD", "threw"), -NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "stone"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("CC", -"and"), NerOnlyTaggedWord("VBG", "breaking") … NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("VBD", "begged"), -NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goat"), NerOnlyTaggedWord("RB", "not"), -NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "tell"), NerOnlyTaggedWord("PRP\$", 
"his"), -NerOnlyTaggedWord("NN", "master"), NerOnlyTaggedWord(".", ".")] - - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("DT", "The"), NerOnlyTaggedWord("NNP", "Goat"), - NerOnlyTaggedWord("VBD", "replied"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("LQU", "\""), - NerOnlyTaggedWord("WRB", "Why"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("PRP", "you"), -NerOnlyTaggedWord("JJ", "silly"), NerOnlyTaggedWord("NN", "fellow") … NerOnlyTaggedWord("DT", "the"), -NerOnlyTaggedWord("NN", "horn"), NerOnlyTaggedWord("MD", "will"), NerOnlyTaggedWord("VB", "speak"), -NerOnlyTaggedWord("IN", "though"), NerOnlyTaggedWord("PRP", "I"), NerOnlyTaggedWord("VB", "be"), -NerOnlyTaggedWord("JJ", "silent"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")] - - CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("VBP", "Do"), NerOnlyTaggedWord("RB", "not"), -NerOnlyTaggedWord("VB", "attempt"), NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "hide"), -NerOnlyTaggedWord("NNS", "things"), NerOnlyTaggedWord("WDT", "which"), NerOnlyTaggedWord("MD", "can"), -NerOnlyTaggedWord("RB", "not"), NerOnlyTaggedWord("VB", "be"), NerOnlyTaggedWord("JJ", "hid"), NerOnlyTaggedWord(".", ".")] +julia> Data[1] +30-element Array{PosTaggedWord,1}: + PosTaggedWord("NNS", "Families") + PosTaggedWord("IN", "of") + PosTaggedWord("NNS", "soldiers") + PosTaggedWord("VBN", "killed") + PosTaggedWord("IN", "in") + PosTaggedWord("DT", "the") + PosTaggedWord("NN", "conflict") + PosTaggedWord("VBD", "joined") + PosTaggedWord("DT", "the") + PosTaggedWord("NNS", "protesters") + PosTaggedWord("WP", "who") + PosTaggedWord("VBD", "carried") + PosTaggedWord("NNS", "banners") + PosTaggedWord("IN", "with") + ⋮ + PosTaggedWord("IN", "as") + PosTaggedWord("LQU", "\"") + PosTaggedWord("NNP", "Bush") + PosTaggedWord("NN", "Number") + PosTaggedWord("CD", "One") + PosTaggedWord("NN", "Terrorist") + PosTaggedWord("RQU", "\"") + PosTaggedWord("CC", "and") + PosTaggedWord("LQU", "\"") + PosTaggedWord("VB", 
"Stop") + PosTaggedWord("DT", "the") + PosTaggedWord("NNS", "Bombings") + PosTaggedWord(".", ".") + PosTaggedWord("LQU", "\"") ``` From c3bb93fb27afdedcd1be1038731054bfba4df2ca Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 15 Mar 2020 00:40:26 +0530 Subject: [PATCH 15/16] updated GMB.jl --- src/GMB.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/GMB.jl b/src/GMB.jl index 89707a4..c3faef2 100644 --- a/src/GMB.jl +++ b/src/GMB.jl @@ -4,11 +4,11 @@ end function GMB(dirpath) @assert(isdir(dirpath), dirpath) - paths=glob("data/*/*/en.tags",dirpath) + paths = glob("data/*/*/en.tags",dirpath) GMB(paths) end -GMB() = GMB(datadep"GMB 2.2.0") +GMB() = GMB(datadep"GMB 2.2.0") MultiResolutionIterators.levelname_map(::Type{GMB}) = [ :doc=>1, :contextfile=>1, :context=>1, :document=>1, @@ -20,38 +20,38 @@ MultiResolutionIterators.levelname_map(::Type{GMB}) = [ function parse_gmb_tagged_word(line::AbstractString) tokens_tags = split(line, '\t') - return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1]) + return PosTaggedWord(tokens_tags[2], tokens_tags[1]) end function parse_gmb(filename) - local sent=[] - sents = @NestedVector(NerOnlyTaggedWord, 2)() + local sent = [] + sents = @NestedVector(PosTaggedWord, 2)() function new_sentence() - sent = @NestedVector(NerOnlyTaggedWord, 1)() + sent = @NestedVector(PosTaggedWord, 1)() push!(sents, sent) end - + # words get_tagged(line) = push!(sent, parse_gmb_tagged_word(line)) # parse - for line in eachline(filename) + for line in eachline(filename) if length(line) == 0 new_sentence() else get_tagged(line) end - end + end return sents end function load(corpus::GMB) - ch=[] - for fn in corpus.filepath - document = parse_gmb(fn) + ch = @NestedVector(PosTaggedWord, 2)() + for fn in corpus.filepath + document = parse_gmb(fn) append!(ch, document) - end + end return(ch) end From 891d5c2d02aa3b09b897a1c6de061c9fb8d52874 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 15 
Mar 2020 01:04:03 +0530 Subject: [PATCH 16/16] added about POS --- docs/src/GMB.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/GMB.md b/docs/src/GMB.md index 0e7c68a..caff1b8 100644 --- a/docs/src/GMB.md +++ b/docs/src/GMB.md @@ -10,6 +10,10 @@ The Groningen Meaning Bank (GMB) consists of public domain English texts with co The GMB is developed at the [University of Groningen](https://www.rug.nl/). A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation. +The part-of-speech tagset used is the Penn Treebank tagset, as listed in Ann Taylor, Mitchell Marcus and Beatrice Santorini (2003): [The Penn Treebank](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.9.8216): An Overview, Section 1.1. + + + For more detail [refer](https://gmb.let.rug.nl/about.php) ```julia