Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding gmb dataset #39

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
5 changes: 3 additions & 2 deletions src/CorpusLoaders.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord
export title, sensekey, word, named_entity, part_of_speech
export load

export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000
export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB

function __init__()
include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
Expand All @@ -24,6 +24,7 @@ function __init__()
include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl"))
include(joinpath(@__DIR__, "GMB_DataDeps.jl"))
end

include("types.jl")
Expand All @@ -38,5 +39,5 @@ include("Twitter.jl")
include("StanfordSentimentTreebank.jl")
include("WikiGold.jl")
include("CoNLL2000.jl")

include("GMB.jl")
end
71 changes: 71 additions & 0 deletions src/GMB.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
    GMB(filepath::Vector)
    GMB(dirpath)

Handle for a Groningen Meaning Bank corpus on disk.

`filepath` holds the paths of the tag files that `load` will parse. The
`GMB(dirpath)` constructor fills it by scanning `dirpath/data/<part>/<doc>/`
and keeping every `en.tags` file it finds.
"""
struct GMB{S}
    filepath::Vector{S}
end

"""
    GMB(dirpath)

Collect the `en.tags` file of every document directory below `dirpath/data`.

# Arguments
- `dirpath`: root of an unpacked GMB release; must be an existing directory.

# Returns
A `GMB{String}` whose `filepath` lists the discovered files in `readdir` order.

# Throws
- `AssertionError` if `dirpath` is not a directory.
"""
function GMB(dirpath)
    @assert(isdir(dirpath), dirpath)
    data_path = joinpath(dirpath, "data")
    paths = String[]
    for part in readdir(data_path)
        part_path = joinpath(data_path, part)
        # A stray regular file next to the partition directories would make
        # `readdir` throw, so only descend into real directories.
        isdir(part_path) || continue
        for doc in readdir(part_path)
            tags_path = joinpath(part_path, doc, "en.tags")
            # `isfile` is false both when `doc` is not a directory and when the
            # document has no `en.tags`, so neither case needs another listing.
            isfile(tags_path) && push!(paths, tags_path)
        end
    end
    return GMB(paths)
end

"""
    GMB()

Build the corpus handle from the `GMB 2.2.0` data dependency, passing the
directory it resolves to on to `GMB(dirpath)`.
"""
function GMB()
    return GMB(datadep"GMB 2.2.0")
end

# Names accepted for each nesting level of a GMB corpus, coarsest level first.
# NOTE(review): `parse_gmb` only builds sentence -> word nesting; confirm that
# these level numbers match the structure `load` actually returns.
function MultiResolutionIterators.levelname_map(::Type{GMB})
    return [
        :doc => 1, :contextfile => 1, :context => 1, :document => 1,
        :para => 2, :paragraph => 2,
        :sent => 3, :sentence => 3,
        :word => 4, :token => 4,
        :char => 5, :character => 5,
    ]
end

"""
    parse_gmb_tagged_word(line::AbstractString)

Turn one non-empty line of a tags file into a `NerOnlyTaggedWord`.

The line is split on single spaces; the second field is passed as the first
constructor argument and the first field as the second.
"""
function parse_gmb_tagged_word(line::AbstractString)
    # NOTE(review): only a literal " " is treated as a separator — confirm the
    # column delimiter and column order of `en.tags` against the corpus files.
    fields = split(line, " ")
    first_field = fields[1]
    second_field = fields[2]
    return NerOnlyTaggedWord(second_field, first_field)
end

"""
    parse_gmb(filename)

Parse one GMB tags file into a `Document`.

The document title is the interned base name of `filename`; its content is a
vector of sentences, each a vector of `NerOnlyTaggedWord`. Every non-empty line
is one tagged word and blank lines separate sentences.

A sentence is opened only when its first word arrives. This keeps the words
that precede the first blank line (previously they were pushed into a scratch
vector that was never attached to the document) and means leading, trailing or
repeated blank lines do not produce empty sentences.
"""
function parse_gmb(filename)
    sents = @NestedVector(NerOnlyTaggedWord, 2)()
    context = Document(intern(basename(filename)), sents)

    # `true` while the last element of `sents` is still accepting words.
    sentence_open = false

    for line in eachline(filename)
        if isempty(line)
            # A blank line ends the current sentence, if there is one.
            sentence_open = false
        else
            if !sentence_open
                push!(sents, @NestedVector(NerOnlyTaggedWord, 1)())
                sentence_open = true
            end
            push!(sents[end], parse_gmb_tagged_word(line))
        end
    end
    return context
end

"""
    load(corpus::GMB)

Parse every file listed in `corpus.filepath` with `parse_gmb` and append the
elements of each parsed document, in order, to a single untyped vector, which
is returned.
"""
function load(corpus::GMB)
    collected = []
    foreach(corpus.filepath) do path
        append!(collected, parse_gmb(path))
    end
    return collected
end
33 changes: 33 additions & 0 deletions src/GMB_DataDeps.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
using DataDeps


# Register one DataDep per GMB release, named "GMB <version>". Each entry pairs
# the release version with the checksum that is passed to `DataDep` for the
# downloaded archive.
for (ver, checksum) in [
    ("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"),
    ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"),
    ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"),
    ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"),
    ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625"),
]
    register(DataDep(
        "GMB $ver",
        """
        Website: https://gmb.let.rug.nl/data.php
        Orignal Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes

        The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
        The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank.
        A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.

        Please cite the following publication if you use the corpora:
        Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496."
        """,
        "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
        checksum;
        post_fetch_method = fn -> begin
            unpack(fn)
            # The archive is published as gmb-<ver>.zip; the previous "gbm-"
            # spelling named a directory that unpacking never creates.
            # NOTE(review): assumes the zip's top-level directory is
            # "gmb-<ver>" — confirm against a real release archive.
            innerdir = "gmb-$(ver)"
            innerfiles = readdir(innerdir)
            # Move everything to current directory, under same name
            mv.(joinpath.(innerdir, innerfiles), innerfiles)
            # `innerdir` is empty after the moves, so a plain `rm` suffices.
            rm(innerdir)
        end,
    ))
end