From 3c86c51e52f2a7ec176bad4980ba284dae8293b3 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 8 Mar 2020 08:39:58 +0530
Subject: [PATCH 01/16] adding GMB.jl

---
 src/GMB.jl | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 src/GMB.jl

diff --git a/src/GMB.jl b/src/GMB.jl
new file mode 100644
index 0000000..e543ce6
--- /dev/null
+++ b/src/GMB.jl
@@ -0,0 +1,71 @@
+struct GMB{S}  # Handle to a GMB corpus on disk; S is the element type of the stored paths.
+    filepath :: Vector{S}  # Paths gathered by the GMB(dirpath) constructor (the "en.tags" files).
+end
+
+function GMB(dirpath)
+    @assert(isdir(dirpath), dirpath)
+        paths=String[]
+            data_path = joinpath.(dirpath,"data")
+                for dir in readdir(data_path)
+                    for d in readdir(joinpath.(data_path,dir))
+                         if ("en.tags" in readdir(joinpath.(data_path,dir,d))) == true
+                          push!(paths,joinpath.(data_path,dir,d,"en.tags"))
+                         end
+                       
+                    end
+                end
+  
+GMB(paths)
+
+end
+
+GMB() = GMB(datadep"GMB 2.2.0") 
+
+MultiResolutionIterators.levelname_map(::Type{GMB}) = [
+    :doc=>1, :contextfile=>1, :context=>1, :document=>1,
+    :para=>2, :paragraph=>2,
+    :sent=>3, :sentence=>3,
+    :word=>4, :token=>4,
+    :char=>5, :character=>5
+    ]
+
+function parse_gmb_tagged_word(line::AbstractString)
+    tokens_tags = split(line,"	")
+    return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1])
+end
+
+function parse_gmb(filename)
+  local sent=[]
+
+    sents = @NestedVector(NerOnlyTaggedWord, 2)()
+    context = Document(intern(basename(filename)), sents)
+
+    function new_sentence()
+        sent = @NestedVector(NerOnlyTaggedWord,1)()
+        push!(sents, sent)
+    end
+    
+
+    # words
+    get_tagged(line) = push!(sent, parse_gmb_tagged_word(line))
+
+    # parse
+  
+for line in eachline(filename)
+        if length(line) == 0
+            new_sentence()
+        else
+            get_tagged(line)
+        end
+    end
+    return context
+end
+
+function load(corpus::GMB)
+    ch=[]
+        for fn in corpus.filepath
+            document = parse_gmb(fn)
+           append!(ch, document)
+        end
+ return(ch)
+end

From 18b393914fa7fb2621b56fa110264aa2fb8355e5 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 8 Mar 2020 08:41:10 +0530
Subject: [PATCH 02/16] adding GMB_DataDeps

---
 src/GMB_DataDeps.jl | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 src/GMB_DataDeps.jl

diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
new file mode 100644
index 0000000..9136a84
--- /dev/null
+++ b/src/GMB_DataDeps.jl
@@ -0,0 +1,33 @@
+using DataDeps
+
+
+for (ver, checksum) in [("1.0.0", "16814254fe194d55a2fcc24858aa76d71de3c49e495bd98478cc7345e766d8b7"),
+            ("1.1.0", "0495577ac3a87c2a64fe6189798ea046de0f44943dfb7b60fe38cf648d34c421"),
+            ("2.0.0", "70b9eb7ca0dc9d67655f9d671d40be10aeff490f0bea4f10cb1946127b74c102"),
+            ("2.1.0", "93fbae725f0125dedb7369403fda1dace85b2dcd8a523ed80af23e863b18ef2c"),
+            ("2.2.0", "0714f07dbcb84a215d668f3ee85892fa8fa4a8154439662eb7529413367b8f56")]
+
+    register(DataDep("GMB $ver",
+        """
+        Website: https://gmb.let.rug.nl/data.php
+        Original Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes
+        
+        The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+        The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank. 
+        A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+        Please cite the following publication if you use the corpora:
+        Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496."
+        """,
+        "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
+        checksum;
+        post_fetch_method = fn -> begin
+            unpack(fn)
+            innerdir = "gbm-$(ver)"
+            innerfiles = readdir(innerdir)
+            # Move everything to current directory, under same name
+            mv.(joinpath.(innerdir, innerfiles), innerfiles)
+            rm(innerdir)
+        end
+    ))
+end

From 45f0867f73514de2b1f2dafbcf7f533e8babd21c Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 8 Mar 2020 09:46:41 +0530
Subject: [PATCH 03/16] checksumupdate

---
 src/GMB_DataDeps.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
index 9136a84..62e4101 100644
--- a/src/GMB_DataDeps.jl
+++ b/src/GMB_DataDeps.jl
@@ -1,11 +1,11 @@
 using DataDeps
 
 
-for (ver, checksum) in [("1.0.0", "16814254fe194d55a2fcc24858aa76d71de3c49e495bd98478cc7345e766d8b7"),
-            ("1.1.0", "0495577ac3a87c2a64fe6189798ea046de0f44943dfb7b60fe38cf648d34c421"),
-            ("2.0.0", "70b9eb7ca0dc9d67655f9d671d40be10aeff490f0bea4f10cb1946127b74c102"),
-            ("2.1.0", "93fbae725f0125dedb7369403fda1dace85b2dcd8a523ed80af23e863b18ef2c"),
-            ("2.2.0", "0714f07dbcb84a215d668f3ee85892fa8fa4a8154439662eb7529413367b8f56")]
+for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"),
+            ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"),
+            ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"),
+            ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"),
+            ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625")]
 
     register(DataDep("GMB $ver",
         """

From c21c4dd551765bbf7567fc0382c8942f8bc180b3 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 8 Mar 2020 10:16:21 +0530
Subject: [PATCH 04/16] updating module

---
 src/CorpusLoaders.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/CorpusLoaders.jl b/src/CorpusLoaders.jl
index 07cf516..31c1a6c 100644
--- a/src/CorpusLoaders.jl
+++ b/src/CorpusLoaders.jl
@@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord
 export title, sensekey, word, named_entity, part_of_speech
 export load
 
-export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000
+export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB
 
 function __init__()
     include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
@@ -24,6 +24,7 @@ function __init__()
     include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
     include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
     include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl"))
+    include(joinpath(@__DIR__, "GMB_DataDeps.jl"))
 end
 
 include("types.jl")
@@ -38,5 +39,5 @@ include("Twitter.jl")
 include("StanfordSentimentTreebank.jl")
 include("WikiGold.jl")
 include("CoNLL2000.jl")
-
+include("GMB.jl")
 end

From 65b49f192efe970a5f3fdcc3cda6ff9f87b9525a Mon Sep 17 00:00:00 2001
From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com>
Date: Wed, 11 Mar 2020 20:52:44 +0530
Subject: [PATCH 05/16] Update src/GMB.jl

Co-Authored-By: Lyndon White <oxinabox@ucc.asn.au>
---
 src/GMB.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GMB.jl b/src/GMB.jl
index e543ce6..a2f9636 100644
--- a/src/GMB.jl
+++ b/src/GMB.jl
@@ -8,7 +8,7 @@ function GMB(dirpath)
             data_path = joinpath.(dirpath,"data")
                 for dir in readdir(data_path)
                     for d in readdir(joinpath.(data_path,dir))
-                         if ("en.tags" in readdir(joinpath.(data_path,dir,d))) == true
+                         if ispath(joinpath(data_path, dir, d, "en.tags")))
                           push!(paths,joinpath.(data_path,dir,d,"en.tags"))
                          end
                        

From d78f5be90681afd46485499b357e840283579e81 Mon Sep 17 00:00:00 2001
From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com>
Date: Wed, 11 Mar 2020 20:53:57 +0530
Subject: [PATCH 06/16] Update src/GMB.jl

Co-Authored-By: Lyndon White <oxinabox@ucc.asn.au>
---
 src/GMB.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GMB.jl b/src/GMB.jl
index a2f9636..c79a485 100644
--- a/src/GMB.jl
+++ b/src/GMB.jl
@@ -41,7 +41,7 @@ function parse_gmb(filename)
     context = Document(intern(basename(filename)), sents)
 
     function new_sentence()
-        sent = @NestedVector(NerOnlyTaggedWord,1)()
+        sent = @NestedVector(NerOnlyTaggedWord, 1)()
         push!(sents, sent)
     end
     

From 0ebe1e85ce6501cd70f7b46f2af31f0900563bb7 Mon Sep 17 00:00:00 2001
From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com>
Date: Wed, 11 Mar 2020 21:08:41 +0530
Subject: [PATCH 07/16] Update src/GMB.jl

Co-Authored-By: Lyndon White <oxinabox@ucc.asn.au>
---
 src/GMB.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GMB.jl b/src/GMB.jl
index c79a485..58f5ab5 100644
--- a/src/GMB.jl
+++ b/src/GMB.jl
@@ -30,7 +30,7 @@ MultiResolutionIterators.levelname_map(::Type{GMB}) = [
     ]
 
 function parse_gmb_tagged_word(line::AbstractString)
-    tokens_tags = split(line,"	")
+    tokens_tags = split(line, '\t')
     return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1])
 end
 

From 35fda3bd60c71e9c12e76aa67aeeb7e3c190d060 Mon Sep 17 00:00:00 2001
From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com>
Date: Fri, 13 Mar 2020 19:33:47 +0530
Subject: [PATCH 08/16] Update src/GMB_DataDeps.jl

Co-Authored-By: Lyndon White <oxinabox@ucc.asn.au>
---
 src/GMB_DataDeps.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
index 62e4101..44228ac 100644
--- a/src/GMB_DataDeps.jl
+++ b/src/GMB_DataDeps.jl
@@ -21,7 +21,7 @@ for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f224259
         """,
         "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
         checksum;
-        post_fetch_method = fn -> begin
+        post_fetch_method = function (fn)
             unpack(fn)
             innerdir = "gbm-$(ver)"
             innerfiles = readdir(innerdir)

From 2bb03e8c64380d8f76d8a48eb2275f123728706e Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sat, 14 Mar 2020 05:18:23 +0530
Subject: [PATCH 09/16] indentation correction with use of glob

---
 src/GMB.jl | 40 +++++++++++++---------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/GMB.jl b/src/GMB.jl
index 58f5ab5..89707a4 100644
--- a/src/GMB.jl
+++ b/src/GMB.jl
@@ -4,19 +4,8 @@ end
 
 function GMB(dirpath)
     @assert(isdir(dirpath), dirpath)
-        paths=String[]
-            data_path = joinpath.(dirpath,"data")
-                for dir in readdir(data_path)
-                    for d in readdir(joinpath.(data_path,dir))
-                         if ispath(joinpath(data_path, dir, d, "en.tags")))
-                          push!(paths,joinpath.(data_path,dir,d,"en.tags"))
-                         end
-                       
-                    end
-                end
-  
-GMB(paths)
-
+    paths=glob("data/*/*/en.tags",dirpath)
+    GMB(paths)
 end
 
 GMB() = GMB(datadep"GMB 2.2.0") 
@@ -35,10 +24,8 @@ function parse_gmb_tagged_word(line::AbstractString)
 end
 
 function parse_gmb(filename)
-  local sent=[]
-
-    sents = @NestedVector(NerOnlyTaggedWord, 2)()
-    context = Document(intern(basename(filename)), sents)
+    local sent=[]
+	sents = @NestedVector(NerOnlyTaggedWord, 2)()
 
     function new_sentence()
         sent = @NestedVector(NerOnlyTaggedWord, 1)()
@@ -50,22 +37,21 @@ function parse_gmb(filename)
     get_tagged(line) = push!(sent, parse_gmb_tagged_word(line))
 
     # parse
-  
-for line in eachline(filename)
+	for line in eachline(filename)
         if length(line) == 0
             new_sentence()
         else
             get_tagged(line)
         end
-    end
-    return context
+	end
+    return sents
 end
 
 function load(corpus::GMB)
-    ch=[]
-        for fn in corpus.filepath
-            document = parse_gmb(fn)
-           append!(ch, document)
-        end
- return(ch)
+	ch=[]
+	for fn in corpus.filepath
+    	document = parse_gmb(fn)
+        append!(ch, document)
+	end
+    return(ch)
 end

From 9fa1deb3ec05dd7aab610a8a27a4702c2bca5306 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sat, 14 Mar 2020 05:53:35 +0530
Subject: [PATCH 10/16] updated_docs with GMB

---
 docs/src/GMB.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 docs/src/GMB.md

diff --git a/docs/src/GMB.md b/docs/src/GMB.md
new file mode 100644
index 0000000..d02c33f
--- /dev/null
+++ b/docs/src/GMB.md
@@ -0,0 +1,58 @@
+# GMB
+The dataset is an extract from the GMB corpus, which is tagged, annotated, 
+and built specifically to train the classifier to predict named entities such as name, location, etc. 
+
+GMB is a fairly large corpus with a lot of annotations.
+Unfortunately, GMB is not perfect. It is not a gold standard corpus, meaning that it’s not completely human annotated and it’s not considered 100% correct. 
+The corpus is created by using already existed annotators and then corrected by humans where needed.
+
+
+```julia
+
+julia> corp = load(GMB())
+37789-element Array{Any,1}:
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Families"), NerOnlyTaggedWord("IN", "of"), 
+NerOnlyTaggedWord("NNS", "soldiers"), NerOnlyTaggedWord("VBN", "killed"), NerOnlyTaggedWord("IN", "in"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "conflict"), NerOnlyTaggedWord("VBD", "joined"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "protesters")  …  NerOnlyTaggedWord("CD", "One"), 
+NerOnlyTaggedWord("NN", "Terrorist"), NerOnlyTaggedWord("RQU", "\""), NerOnlyTaggedWord("CC", "and"), 
+NerOnlyTaggedWord("LQU", "\""), NerOnlyTaggedWord("VB", "Stop"), NerOnlyTaggedWord("DT", "the"), 
+NerOnlyTaggedWord("NNS", "Bombings"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("PRP", "They"), NerOnlyTaggedWord("VBD", "marched"), 
+NerOnlyTaggedWord("IN", "from"), NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "Houses"), 
+NerOnlyTaggedWord("IN", "of"), NerOnlyTaggedWord("NN", "Parliament"), NerOnlyTaggedWord("TO", "to"), 
+NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "rally"), NerOnlyTaggedWord("IN", "in"), 
+NerOnlyTaggedWord("NNP", "Hyde"), NerOnlyTaggedWord("NNP", "Park"), NerOnlyTaggedWord(".", ".")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Police"), NerOnlyTaggedWord("VBD", "put"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "number"), NerOnlyTaggedWord("IN", "of"), 
+NerOnlyTaggedWord("NNS", "marchers"), NerOnlyTaggedWord("IN", "at"), NerOnlyTaggedWord("CD", "10,000"), 
+NerOnlyTaggedWord("IN", "while"), NerOnlyTaggedWord("NNS", "organizers"), NerOnlyTaggedWord("VBD", "claimed"), 
+NerOnlyTaggedWord("PRP", "it"), NerOnlyTaggedWord("VBD", "was"), NerOnlyTaggedWord("CD", "100,000"), 
+NerOnlyTaggedWord(".", ".")]
+
+  ⋮
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("IN", "At"), NerOnlyTaggedWord("JJ", "last"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goatherd"), NerOnlyTaggedWord("VBD", "threw"), 
+NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "stone"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("CC", 
+"and"), NerOnlyTaggedWord("VBG", "breaking")  …  NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("VBD", "begged"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goat"), NerOnlyTaggedWord("RB", "not"), 
+NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "tell"), NerOnlyTaggedWord("PRP\$", "his"), 
+NerOnlyTaggedWord("NN", "master"), NerOnlyTaggedWord(".", ".")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("DT", "The"), NerOnlyTaggedWord("NNP", "Goat"),
+ NerOnlyTaggedWord("VBD", "replied"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("LQU", "\""),
+ NerOnlyTaggedWord("WRB", "Why"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("PRP", "you"), 
+NerOnlyTaggedWord("JJ", "silly"), NerOnlyTaggedWord("NN", "fellow")  …  NerOnlyTaggedWord("DT", "the"), 
+NerOnlyTaggedWord("NN", "horn"), NerOnlyTaggedWord("MD", "will"), NerOnlyTaggedWord("VB", "speak"), 
+NerOnlyTaggedWord("IN", "though"), NerOnlyTaggedWord("PRP", "I"), NerOnlyTaggedWord("VB", "be"), 
+NerOnlyTaggedWord("JJ", "silent"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("VBP", "Do"), NerOnlyTaggedWord("RB", "not"), 
+NerOnlyTaggedWord("VB", "attempt"), NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "hide"), 
+NerOnlyTaggedWord("NNS", "things"), NerOnlyTaggedWord("WDT", "which"), NerOnlyTaggedWord("MD", "can"), 
+NerOnlyTaggedWord("RB", "not"), NerOnlyTaggedWord("VB", "be"), NerOnlyTaggedWord("JJ", "hid"), NerOnlyTaggedWord(".", ".")]
+
+```

From 64896b0253b73ebcd810995ed9c4dc0cb021e7f3 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sat, 14 Mar 2020 05:54:21 +0530
Subject: [PATCH 11/16] updated make.jl

---
 docs/make.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/make.jl b/docs/make.jl
index ed95e17..4e42ba3 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -17,7 +17,8 @@ makedocs(modules = [CorpusLoaders],
              "Twitter" => "Twitter.md",
              "WikiCorpus" => "WikiCorpus.md",
              "WikiGold" => "WikiGold.md",
-             "API References" => "APIReference.md"
+             "API References" => "APIReference.md",
+             "GMB" => "GMB.md"
         ])
 
 

From 535b8f6dd46ec37a426caabee187f4222e92f9b1 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sat, 14 Mar 2020 06:06:21 +0530
Subject: [PATCH 12/16] updating Read me

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index b4ff181..2945c2b 100644
--- a/README.md
+++ b/README.md
@@ -40,3 +40,4 @@ Follow the links below for full docs on the usage of the corpora.
  - [IMDB movie reviews](docs/src/IMDB.md)
  - [Twitter sentiment dataset](docs/src/Twitter.md)
  - [Stanford Sentiment Treebank](docs/src/SST.md)
+ - [GMB](docs/src/GMB.md)

From b322cf4538e81ffc1b55e9d7d36fd451308b9feb Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sat, 14 Mar 2020 07:57:38 +0530
Subject: [PATCH 13/16] adding tests

---
 test/test_GMB.jl | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 test/test_GMB.jl

diff --git a/test/test_GMB.jl b/test/test_GMB.jl
new file mode 100644
index 0000000..4e54bed
--- /dev/null
+++ b/test/test_GMB.jl
@@ -0,0 +1,21 @@
+using CorpusLoaders
+using Test
+using Base.Iterators
+using MultiResolutionIterators
+using DataDeps
+
+@testset "Using flatten_levels" for path in [datadep"GMB 1.0.0", datadep"GMB 1.1.0", datadep"GMB 2.0.0", datadep"GMB 2.1.0", datadep"GMB 2.2.0"]
+    train = load(GMB(path))  # load the release under test; GMB() would always load 2.2.0
+    docs = train[1:5]
+    # NOTE(review): the level map below is CoNLL's, not GMB's -- confirm this is intended.
+    words = full_consolidate(flatten_levels(docs, (!lvls)(CoNLL, :word)))
+    @test length(words) > length(docs)
+    @test typeof(words) == Vector{CorpusLoaders.PosTaggedWord}
+
+    plain_words = word.(words)
+    @test typeof(plain_words) <: Vector{String}
+
+    pos_tags = part_of_speech.(words)  # the GMB loader yields POS-tagged words, not NER tags
+    @test typeof(pos_tags) <: Vector{String}
+
+end

From c96c3710ecdb2250098e5e1c3af831a81407dc4f Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 15 Mar 2020 00:39:40 +0530
Subject: [PATCH 14/16] updating docs

---
 docs/src/GMB.md | 91 ++++++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/docs/src/GMB.md b/docs/src/GMB.md
index d02c33f..0e7c68a 100644
--- a/docs/src/GMB.md
+++ b/docs/src/GMB.md
@@ -6,53 +6,60 @@ GMB is a fairly large corpus with a lot of annotations.
 Unfortunately, GMB is not perfect. It is not a gold standard corpus, meaning that it’s not completely human annotated and it’s not considered 100% correct. 
 The corpus is created by using already existed annotators and then corrected by humans where needed.
 
+The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+The GMB is developed at the [University of Groningen](https://www.rug.nl/).
+ A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+For more detail [refer](https://gmb.let.rug.nl/about.php)
 
 ```julia
 
-julia> corp = load(GMB())
-37789-element Array{Any,1}:
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Families"), NerOnlyTaggedWord("IN", "of"), 
-NerOnlyTaggedWord("NNS", "soldiers"), NerOnlyTaggedWord("VBN", "killed"), NerOnlyTaggedWord("IN", "in"), 
-NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "conflict"), NerOnlyTaggedWord("VBD", "joined"), 
-NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "protesters")  …  NerOnlyTaggedWord("CD", "One"), 
-NerOnlyTaggedWord("NN", "Terrorist"), NerOnlyTaggedWord("RQU", "\""), NerOnlyTaggedWord("CC", "and"), 
-NerOnlyTaggedWord("LQU", "\""), NerOnlyTaggedWord("VB", "Stop"), NerOnlyTaggedWord("DT", "the"), 
-NerOnlyTaggedWord("NNS", "Bombings"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
-
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("PRP", "They"), NerOnlyTaggedWord("VBD", "marched"), 
-NerOnlyTaggedWord("IN", "from"), NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "Houses"), 
-NerOnlyTaggedWord("IN", "of"), NerOnlyTaggedWord("NN", "Parliament"), NerOnlyTaggedWord("TO", "to"), 
-NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "rally"), NerOnlyTaggedWord("IN", "in"), 
-NerOnlyTaggedWord("NNP", "Hyde"), NerOnlyTaggedWord("NNP", "Park"), NerOnlyTaggedWord(".", ".")]
-
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Police"), NerOnlyTaggedWord("VBD", "put"), 
-NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "number"), NerOnlyTaggedWord("IN", "of"), 
-NerOnlyTaggedWord("NNS", "marchers"), NerOnlyTaggedWord("IN", "at"), NerOnlyTaggedWord("CD", "10,000"), 
-NerOnlyTaggedWord("IN", "while"), NerOnlyTaggedWord("NNS", "organizers"), NerOnlyTaggedWord("VBD", "claimed"), 
-NerOnlyTaggedWord("PRP", "it"), NerOnlyTaggedWord("VBD", "was"), NerOnlyTaggedWord("CD", "100,000"), 
-NerOnlyTaggedWord(".", ".")]
+julia> Data = load(GMB())
+37789-element Array{Array{PosTaggedWord,1},1}:
+ [PosTaggedWord("NNS", "Families"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "soldiers"), PosTaggedWord("VBN", "killed"), PosTaggedWord("IN", "in"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "conflict"), PosTaggedWord("VBD", "joined"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "protesters")  …  PosTaggedWord("CD", "One"), PosTaggedWord("NN", "Terrorist"), PosTaggedWord("RQU", "\""), PosTaggedWord("CC", "and"), PosTaggedWord("LQU", "\""), PosTaggedWord("VB", "Stop"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Bombings"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")]
+
+ [PosTaggedWord("PRP", "They"), PosTaggedWord("VBD", "marched"), PosTaggedWord("IN", "from"), PosTaggedWord("DT", "the"), PosTaggedWord("NNS", "Houses"), PosTaggedWord("IN", "of"), PosTaggedWord("NN", "Parliament"), PosTaggedWord("TO", "to"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "rally"), PosTaggedWord("IN", "in"), PosTaggedWord("NNP", "Hyde"), PosTaggedWord("NNP", "Park"), PosTaggedWord(".", ".")]
+
+ [PosTaggedWord("NNS", "Police"), PosTaggedWord("VBD", "put"), PosTaggedWord("DT", "the"), PosTaggedWord("NN", "number"), PosTaggedWord("IN", "of"), PosTaggedWord("NNS", "marchers"), PosTaggedWord("IN", "at"), PosTaggedWord("CD", "10,000"), PosTaggedWord("IN", "while"), PosTaggedWord("NNS", "organizers"), PosTaggedWord("VBD", "claimed"), PosTaggedWord("PRP", "it"), PosTaggedWord("VBD", "was"), PosTaggedWord("CD", "100,000"), PosTaggedWord(".", ".")]
 
   ⋮
+                                                                            
+ [PosTaggedWord("IN", "At"), PosTaggedWord("JJ", "last"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goatherd"), PosTaggedWord("VBD", "threw"), PosTaggedWord("DT", "a"), PosTaggedWord("NN", "stone"), PosTaggedWord(",", ","), PosTaggedWord("CC", "and"), PosTaggedWord("VBG", "breaking")  …  PosTaggedWord(",", ","), PosTaggedWord("VBD", "begged"), PosTaggedWord("DT", "the"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("RB", "not"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "tell"), PosTaggedWord("PRP\$", "his"), PosTaggedWord("NN", "master"), PosTaggedWord(".", ".")]
+
+ [PosTaggedWord("DT", "The"), PosTaggedWord("NNP", "Goat"), PosTaggedWord("VBD", "replied"), PosTaggedWord(",", ","), PosTaggedWord("LQU", "\""), PosTaggedWord("WRB", "Why"), PosTaggedWord(",", ","), PosTaggedWord("PRP", "you"), PosTaggedWord("JJ", "silly"), PosTaggedWord("NN", "fellow")  …  PosTaggedWord("DT", "the"), PosTaggedWord("NN", "horn"), PosTaggedWord("MD", "will"), PosTaggedWord("VB", "speak"), PosTaggedWord("IN", "though"), PosTaggedWord("PRP", "I"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "silent"), PosTaggedWord(".", "."), PosTaggedWord("LQU", "\"")]
+
+ [PosTaggedWord("VBP", "Do"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "attempt"), PosTaggedWord("TO", "to"), PosTaggedWord("VB", "hide"), PosTaggedWord("NNS", "things"), PosTaggedWord("WDT", "which"), PosTaggedWord("MD", "can"), PosTaggedWord("RB", "not"), PosTaggedWord("VB", "be"), PosTaggedWord("JJ", "hid"), PosTaggedWord(".", ".")] 
 
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("IN", "At"), NerOnlyTaggedWord("JJ", "last"), 
-NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goatherd"), NerOnlyTaggedWord("VBD", "threw"), 
-NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "stone"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("CC", 
-"and"), NerOnlyTaggedWord("VBG", "breaking")  …  NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("VBD", "begged"), 
-NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goat"), NerOnlyTaggedWord("RB", "not"), 
-NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "tell"), NerOnlyTaggedWord("PRP\$", "his"), 
-NerOnlyTaggedWord("NN", "master"), NerOnlyTaggedWord(".", ".")]
-
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("DT", "The"), NerOnlyTaggedWord("NNP", "Goat"),
- NerOnlyTaggedWord("VBD", "replied"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("LQU", "\""),
- NerOnlyTaggedWord("WRB", "Why"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("PRP", "you"), 
-NerOnlyTaggedWord("JJ", "silly"), NerOnlyTaggedWord("NN", "fellow")  …  NerOnlyTaggedWord("DT", "the"), 
-NerOnlyTaggedWord("NN", "horn"), NerOnlyTaggedWord("MD", "will"), NerOnlyTaggedWord("VB", "speak"), 
-NerOnlyTaggedWord("IN", "though"), NerOnlyTaggedWord("PRP", "I"), NerOnlyTaggedWord("VB", "be"), 
-NerOnlyTaggedWord("JJ", "silent"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
-
- CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("VBP", "Do"), NerOnlyTaggedWord("RB", "not"), 
-NerOnlyTaggedWord("VB", "attempt"), NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "hide"), 
-NerOnlyTaggedWord("NNS", "things"), NerOnlyTaggedWord("WDT", "which"), NerOnlyTaggedWord("MD", "can"), 
-NerOnlyTaggedWord("RB", "not"), NerOnlyTaggedWord("VB", "be"), NerOnlyTaggedWord("JJ", "hid"), NerOnlyTaggedWord(".", ".")]
+julia> Data[1]
+30-element Array{PosTaggedWord,1}:
+ PosTaggedWord("NNS", "Families")
+ PosTaggedWord("IN", "of")
+ PosTaggedWord("NNS", "soldiers")
+ PosTaggedWord("VBN", "killed")
+ PosTaggedWord("IN", "in")
+ PosTaggedWord("DT", "the")
+ PosTaggedWord("NN", "conflict")
+ PosTaggedWord("VBD", "joined")
+ PosTaggedWord("DT", "the")
+ PosTaggedWord("NNS", "protesters")
+ PosTaggedWord("WP", "who")
+ PosTaggedWord("VBD", "carried")
+ PosTaggedWord("NNS", "banners") 
+ PosTaggedWord("IN", "with") 
+ ⋮
+ PosTaggedWord("IN", "as")
+ PosTaggedWord("LQU", "\"")
+ PosTaggedWord("NNP", "Bush")
+ PosTaggedWord("NN", "Number")
+ PosTaggedWord("CD", "One") 
+ PosTaggedWord("NN", "Terrorist")
+ PosTaggedWord("RQU", "\"") 
+ PosTaggedWord("CC", "and") 
+ PosTaggedWord("LQU", "\"") 
+ PosTaggedWord("VB", "Stop")
+ PosTaggedWord("DT", "the") 
+ PosTaggedWord("NNS", "Bombings")
+ PosTaggedWord(".", ".") 
+ PosTaggedWord("LQU", "\"") 
 
 ```

From c3bb93fb27afdedcd1be1038731054bfba4df2ca Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 15 Mar 2020 00:40:26 +0530
Subject: [PATCH 15/16] updated GMB.jl

---
 src/GMB.jl | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/GMB.jl b/src/GMB.jl
index 89707a4..c3faef2 100644
--- a/src/GMB.jl
+++ b/src/GMB.jl
@@ -4,11 +4,11 @@ end
 
 function GMB(dirpath)
     @assert(isdir(dirpath), dirpath)
-    paths=glob("data/*/*/en.tags",dirpath)
+    paths = glob("data/*/*/en.tags",dirpath)
     GMB(paths)
 end
 
-GMB() = GMB(datadep"GMB 2.2.0") 
+GMB() = GMB(datadep"GMB 2.2.0")
 
 MultiResolutionIterators.levelname_map(::Type{GMB}) = [
     :doc=>1, :contextfile=>1, :context=>1, :document=>1,
@@ -20,38 +20,38 @@ MultiResolutionIterators.levelname_map(::Type{GMB}) = [
 
 function parse_gmb_tagged_word(line::AbstractString)
     tokens_tags = split(line, '\t')
-    return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1])
+    return PosTaggedWord(tokens_tags[2], tokens_tags[1])
 end
 
 function parse_gmb(filename)
-    local sent=[]
-	sents = @NestedVector(NerOnlyTaggedWord, 2)()
+    local sent = []
+    sents = @NestedVector(PosTaggedWord, 2)()
 
     function new_sentence()
-        sent = @NestedVector(NerOnlyTaggedWord, 1)()
+        sent = @NestedVector(PosTaggedWord, 1)()
         push!(sents, sent)
     end
-    
+
 
     # words
     get_tagged(line) = push!(sent, parse_gmb_tagged_word(line))
 
     # parse
-	for line in eachline(filename)
+    for line in eachline(filename)
         if length(line) == 0
             new_sentence()
         else
             get_tagged(line)
         end
-	end
+    end
     return sents
 end
 
 function load(corpus::GMB)
-	ch=[]
-	for fn in corpus.filepath
-    	document = parse_gmb(fn)
+    ch = @NestedVector(PosTaggedWord, 2)()
+    for fn in corpus.filepath
+        document = parse_gmb(fn)
         append!(ch, document)
-	end
+    end
     return(ch)
 end

From 891d5c2d02aa3b09b897a1c6de061c9fb8d52874 Mon Sep 17 00:00:00 2001
From: tejasvaidhyadev <iamtejasvaidhya@gmail.com>
Date: Sun, 15 Mar 2020 01:04:03 +0530
Subject: [PATCH 16/16] added about POS

---
 docs/src/GMB.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/src/GMB.md b/docs/src/GMB.md
index 0e7c68a..caff1b8 100644
--- a/docs/src/GMB.md
+++ b/docs/src/GMB.md
@@ -10,6 +10,10 @@ The Groningen Meaning Bank (GMB) consists of public domain English texts with co
 The GMB is developed at the [University of Groningen](https://www.rug.nl/).
  A multi-lingual version of the GMB is the [Parallel Meaning Bank](https://pmb.let.rug.nl/). A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
 
+The part-of-speech tags follow the Penn Treebank tagset, as listed in Ann Taylor, Mitchell Marcus and Beatrice Santorini (2003): [The Penn Treebank: An Overview](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.9.8216), Section 1.1.
+
+
+
 For more detail [refer](https://gmb.let.rug.nl/about.php)
 
 ```julia