Commit

Merge pull request #7 from svilupp/update-docs-and-types

[DRAFT] Update docstrings and allow AbstractString
astariul authored Jul 27, 2024
2 parents 9a0b39a + 2f02819 commit 0468142
Showing 7 changed files with 207 additions and 64 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -2,11 +2,11 @@

[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://astariul.github.io/Sentencize.jl/stable/) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://astariul.github.io/Sentencize.jl/dev/) [![Build Status](https://github.com/astariul/Sentencize.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/astariul/Sentencize.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/astariul/Sentencize.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/astariul/Sentencize.jl) [![Aqua](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)

**Text to sentence splitter using heuristic algorithm.**
**Text to sentence splitter using a heuristic algorithm.**

This module is a port of the [python package `sentence-splitter`](https://github.com/berkmancenter/mediacloud-sentence-splitter).
This module is a port of the [Python package `sentence-splitter`](https://github.com/berkmancenter/mediacloud-sentence-splitter).

The module allows splitting of text paragraphs into sentences. It is based on scripts developed by Philipp Koehn and Josh Schroeder for processing the [Europarl corpus](http://www.statmt.org/europarl/).
The module allows the splitting of text paragraphs into sentences. It is based on scripts developed by Philipp Koehn and Josh Schroeder for processing the [Europarl corpus](http://www.statmt.org/europarl/).

## Usage

@@ -31,7 +31,7 @@ println(sen)
You can specify your own non-breaking prefixes file:

```julia
sen = Sentencize.split_sentence("This is an example.", prefix_file="my_prefixes.txt", lang=missing)
sen = Sentencize.split_sentence("This is an example.", prefix_file="my_prefixes.txt", lang=nothing)
```
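For reference, the parser in src/Sentencize.jl strips `#`-comments, trims whitespace, and marks lines tagged `#NUMERIC_ONLY#` as numeric-only prefixes. As a sketch (the file name and prefixes here are illustrative, not shipped with the package), such a prefixes file could look like:

```
# Lines starting with "#" are comments and are ignored
Mr
Dr
Prof
No #NUMERIC_ONLY#
```

A `#NUMERIC_ONLY#` prefix such as `No` only suppresses a sentence break when the following word starts with a digit (e.g. "No. 5").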

Or even pass the prefixes as a dictionary:
3 changes: 2 additions & 1 deletion docs/make.jl
@@ -13,7 +13,8 @@ makedocs(;
assets = String[]
),
pages = [
"Home" => "index.md"
"Home" => "index.md",
"API Reference" => "api_reference.md"
]
)

6 changes: 6 additions & 0 deletions docs/src/api_reference.md
@@ -0,0 +1,6 @@
```@index
```

```@autodocs
Modules = [Sentencize]
```
67 changes: 64 additions & 3 deletions docs/src/index.md
@@ -6,9 +6,70 @@ CurrentModule = Sentencize

Documentation for [Sentencize](https://github.com/astariul/Sentencize.jl).

```@index
**Text to sentence splitter using a heuristic algorithm.**

This module is a port of the [Python package `sentence-splitter`](https://github.com/berkmancenter/mediacloud-sentence-splitter).

The module allows the splitting of text paragraphs into sentences. It is based on scripts developed by Philipp Koehn and Josh Schroeder for processing the [Europarl corpus](http://www.statmt.org/europarl/).

## Usage

The module uses punctuation and capitalization clues to split plain text into a list of sentences:

```julia
import Sentencize

sen = Sentencize.split_sentence("This is a paragraph. It contains several sentences. \"But why,\" you ask?")
println(sen)
# ["This is a paragraph.", "It contains several sentences.", "\"But why,\" you ask?"]
```

You can specify a language other than English:

```julia
sen = Sentencize.split_sentence("Brookfield Office Properties Inc. (« BOPI »), dont les actifs liés aux immeubles directement...", lang="fr")
println(sen)
# ["Brookfield Office Properties Inc. (« BOPI »), dont les actifs liés aux immeubles directement..."]
```

You can specify your own non-breaking prefixes file:

```julia
sen = Sentencize.split_sentence("This is an example.", prefix_file="my_prefixes.txt", lang=nothing)
```

```@autodocs
Modules = [Sentencize]
Or even pass the prefixes as a dictionary:

```julia
sen = Sentencize.split_sentence("This is another example. Another sentence.", prefixes=Dict("example" => Sentencize.default))
# ["This is another example. Another sentence."]
```

## Languages

Currently supported languages are:

- Catalan (`ca`)
- Czech (`cs`)
- Danish (`da`)
- Dutch (`nl`)
- English (`en`)
- Finnish (`fi`)
- French (`fr`)
- German (`de`)
- Greek (`el`)
- Hungarian (`hu`)
- Icelandic (`is`)
- Italian (`it`)
- Latvian (`lv`)
- Lithuanian (`lt`)
- Norwegian (Bokmål) (`no`)
- Polish (`pl`)
- Portuguese (`pt`)
- Romanian (`ro`)
- Russian (`ru`)
- Slovak (`sk`)
- Slovene (`sl`)
- Spanish (`es`)
- Swedish (`sv`)
- Turkish (`tr`)
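Since these codes live in the `SUPPORTED_LANG` constant in src/Sentencize.jl, a small guard before splitting could look like the following sketch (note `Sentencize.SUPPORTED_LANG` is an internal constant, not part of the exported API):

```julia
import Sentencize

# Sketch: fall back to English for unsupported language codes.
lang = "fr"
code = lang in Sentencize.SUPPORTED_LANG ? lang : "en"
sen = Sentencize.split_sentence("Bonjour. Comment ça va ?", lang = code)
```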
160 changes: 112 additions & 48 deletions src/Sentencize.jl
@@ -5,59 +5,106 @@ export PrefixType
export Prefixes
export split_sentence

"""
SUPPORTED_LANG
Supported languages.
Contains:
- "ca"
- "cs"
- "da"
- "de"
- "el"
- "en"
- "es"
- "fi"
- "fr"
- "hu"
- "is"
- "it"
- "lt"
- "lv"
- "nl"
- "no"
- "pl"
- "pt"
- "ro"
- "ru"
- "sk"
- "sl"
- "sv"
- "tr"
"""
const SUPPORTED_LANG = ["ca", "cs", "da", "de", "el", "en", "es", "fi", "fr",
"hu", "is", "it", "lt", "lv", "nl", "no", "pl", "pt",
"ro", "ru", "sk", "sl", "sv", "tr"]
"hu", "is", "it", "lt", "lv", "nl", "no", "pl", "pt",
"ro", "ru", "sk", "sl", "sv", "tr"]

@enum PrefixType default numeric_only

struct Prefixes
non_breaking_prefixes::Dict{String, PrefixType}

function Prefixes(prefixes::Dict{String, PrefixType}=Dict{String, PrefixType}(); prefix_file=missing, lang="en")
function _load_prefix_file(pfile)
nb_prefixes = Dict{String, PrefixType}()
open(pfile) do file
for line in eachline(file)
if occursin("#NUMERIC_ONLY#", line)
prefix_type = numeric_only
else
prefix_type = default
end
function _load_prefix_file(T::Type{<:AbstractString}, pfile)
nb_prefixes = Dict{T,PrefixType}()
open(pfile) do file
for line in eachline(file)
if occursin("#NUMERIC_ONLY#", line)
prefix_type = numeric_only
else
prefix_type = default
end

line = replace(line, r"#.*" => "") # Remove comments
line = strip(line)

if isempty(line)
continue
end

nb_prefixes[line] = prefix_type
end
line = replace(line, r"#.*" => "") # Remove comments
line = strip(line)

if isempty(line)
continue
end
return nb_prefixes

nb_prefixes[line] = prefix_type
end

if !ismissing(lang)
end
return nb_prefixes
end


@enum PrefixType default numeric_only

"""
Prefixes(prefixes::Dict{T,PrefixType}=Dict{String,PrefixType}(); prefix_file::Union{String,Nothing}=nothing, lang::Union{String,Nothing}="en") where {T<:AbstractString}
Constructs `Prefixes`.
# Arguments
- `prefixes::Dict{<:AbstractString,PrefixType}=Dict{T,PrefixType}()`: Optional. A dictionary of non-breaking prefixes.
- `prefix_file::Union{String,Nothing}=nothing`: Optional. A path to a file containing non-breaking prefixes to add to provided `prefixes`.
- `lang::Union{String,Nothing}="en"`: Optional. The language of the non-breaking prefixes (see `?SUPPORTED_LANG` for available languages) to be added to `prefixes`.
"""
struct Prefixes{T<:AbstractString}
non_breaking_prefixes::Dict{T,PrefixType}

function Prefixes(prefixes::Dict{T,PrefixType}=Dict{String,PrefixType}(); prefix_file::Union{String,Nothing}=nothing, lang::Union{String,Nothing}="en") where {T<:AbstractString}

if !isnothing(lang)
if !(lang in SUPPORTED_LANG)
throw(ArgumentError("Unsupported language. Use a supported language ($SUPPORTED_LANG). " *
throw(ArgumentError("Unsupported language. Use a supported language ($SUPPORTED_LANG). " *
"You can also provide your own non_breaking_prefixes file with the " *
"keyword argument `prefix_file`."))
else
merge!(prefixes, _load_prefix_file(joinpath(@__DIR__, "non_breaking_prefixes/$lang.txt")))
merge!(prefixes, _load_prefix_file(T, joinpath(@__DIR__, "non_breaking_prefixes/$lang.txt")))
end
end

if !ismissing(prefix_file)
merge!(prefixes, _load_prefix_file(prefix_file))
if !isnothing(prefix_file)
if isfile(prefix_file)
merge!(prefixes, _load_prefix_file(T, prefix_file))
else
throw(ArgumentError("File $prefix_file does not exist."))
end
end

new(prefixes)
new{T}(prefixes)
end
end
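With the new parametric definition above, the key type `T` is inferred from the supplied dictionary, so any `AbstractString` key now works. A brief usage sketch (the `approx` prefix is illustrative):

```julia
using Sentencize

# String keys; the English defaults are merged in because lang defaults to "en"
pf = Prefixes(Dict("approx" => Sentencize.default))

# AbstractString keys also work now, e.g. a SubString produced by strip
key = strip(" approx ")
pf2 = Prefixes(Dict(key => Sentencize.default), lang = nothing)
```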


function _basic_sentence_breaks(text::String)
function _basic_sentence_breaks(text::AbstractString)
# Non-period end of sentence markers (?!) followed by sentence starters
text = replace(text, r"([?!]) +(['\"([\u00bf\u00A1\p{Pi}]*[\p{Lu}\p{Lo}])" => s"\1\n\2")

@@ -76,12 +76,12 @@ function _basic_sentence_breaks(text::String)
end


function _is_prefix_honorific(prefix::SubString{String}, starting_punct::SubString{String}, non_breaking_prefixes::Dict{String,PrefixType})
function _is_prefix_honorific(prefix::AbstractString, starting_punct::AbstractString, non_breaking_prefixes::Dict{<:AbstractString,PrefixType})
# Check if \\1 is a known honorific and \\2 is empty.
if prefix != ""
if !isempty(prefix)
if prefix in keys(non_breaking_prefixes)
if non_breaking_prefixes[prefix] == default
if starting_punct == ""
if isempty(starting_punct)
return true
end
end
@@ -91,13 +91,13 @@ function _is_prefix_honorific(prefix::SubStri
end


function _is_numeric(prefix::SubString{String}, starting_punct::SubString{String}, next_word::SubString{String}, non_breaking_prefixes::Dict{String,PrefixType})
function _is_numeric(prefix::AbstractString, starting_punct::AbstractString, next_word::AbstractString, non_breaking_prefixes::Dict{<:AbstractString,PrefixType})
# The next word has a bunch of initial quotes, maybe a space, then either upper case or a number.
if prefix != ""
if !isempty(prefix)
if prefix in keys(non_breaking_prefixes)
if non_breaking_prefixes[prefix] == numeric_only
if starting_punct == ""
if match(r"^[0-9]+", next_word) != nothing
if isempty(starting_punct)
if !isnothing(match(r"^[0-9]+", next_word))
return true
end
end
@@ -108,7 +108,7 @@ function _is_numeric(prefix::SubString{String
end


function split_sentence(text::String; prefixes::Dict{String, PrefixType}=Dict{String, PrefixType}(), prefix_file=missing, lang="en")
"""
split_sentence(text::AbstractString; prefixes::Dict{<:AbstractString,PrefixType}=Dict{String,PrefixType}(), prefix_file::Union{String,Nothing}=nothing, lang::Union{String,Nothing}="en")
Splits a `text` into sentences.
# Arguments
- `text::AbstractString`: The text to split into sentences.
- `prefixes::Dict{<:AbstractString,PrefixType}`: Optional. A dictionary of non-breaking prefixes.
- `prefix_file::Union{String,Nothing}`: Optional. A path to a file containing non-breaking prefixes to add to provided `prefixes`.
- `lang::Union{String,Nothing}`: Optional. The language of the non-breaking prefixes (see `?SUPPORTED_LANG` for available languages) to be added to `prefixes`. Defaults to `"en"` (English).
# Examples
```julia
split_sentence("This is a paragraph. It contains several sentences. \"But why,\" you ask?")
# Output: ["This is a paragraph.", "It contains several sentences.", "\"But why,\" you ask?"]
```
"""
function split_sentence(text::AbstractString; prefixes::Dict{<:AbstractString,PrefixType}=Dict{String,PrefixType}(), prefix_file::Union{String,Nothing}=nothing, lang::Union{String,Nothing}="en")
if text == ""
return []
end
@@ -120,19 +120,19 @@ function split_sentence(text::String; prefixes::Dict{String, PrefixType}=Dict{St
# Special punctuation cases : check all remaining periods
words = split(text, r" +")
text = ""
for i in 1:length(words) - 1
for i in 1:length(words)-1
m = match(r"([\w\.\-]*)(['\"\)\]\%\p{Pf}]*)(\.+)$", words[i])

if m != nothing
if !isnothing(m)
prefix = m.captures[1]
starting_punct = m.captures[2]

if _is_prefix_honorific(prefix, starting_punct, pf.non_breaking_prefixes)
# Not breaking
elseif match(r"(\.)[\p{Lu}\p{Lo}\-]+(\.+)$", words[i]) != nothing
elseif !isnothing(match(r"(\.)[\p{Lu}\p{Lo}\-]+(\.+)$", words[i]))
# Not breaking - upper case acronym
elseif match(r"^([ ]*['\"([\u00bf\u00A1\p{Pi}]*[ ]*[\p{Lu}\p{Lo}0-9])", words[i + 1]) != nothing
if !_is_numeric(prefix, starting_punct, words[i + 1], pf.non_breaking_prefixes)
elseif !isnothing(match(r"^([ ]*['\"([\u00bf\u00A1\p{Pi}]*[ ]*[\p{Lu}\p{Lo}0-9])", words[i+1]))
if !_is_numeric(prefix, starting_punct, words[i+1], pf.non_breaking_prefixes)
words[i] = words[i] * "\n"
# We always add a return for these unless we have a numeric non-breaker and a number start
end
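The custom-prefix path above can be exercised as in the test suite, where user-supplied `default` prefixes suppress sentence breaks after the matching words:

```julia
import Sentencize

sen = Sentencize.split_sentence("Hello. Prefix1. Prefix2. Hello again. Good bye.",
    prefixes = Dict("Prefix1" => Sentencize.default,
                    "Prefix2" => Sentencize.default))
# ["Hello.", "Prefix1. Prefix2. Hello again.", "Good bye."]
```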
17 changes: 14 additions & 3 deletions test/Sentencize.jl
@@ -10,15 +10,15 @@
@test "test-prefix" in keys(ss.non_breaking_prefixes) &&
"Apr" in keys(ss.non_breaking_prefixes) # English is also loaded

ss = Prefixes(Dict("test-prefix" => Sentencize.default), lang = missing)
ss = Prefixes(Dict("test-prefix" => Sentencize.default), lang = nothing)
@test "test-prefix" in keys(ss.non_breaking_prefixes) &&
!("Apr" in keys(ss.non_breaking_prefixes))

ss = Prefixes(prefix_file = "test.txt")
@test "another-test-prefix" in keys(ss.non_breaking_prefixes) &&
"Apr" in keys(ss.non_breaking_prefixes)

ss = Prefixes(prefix_file = "test.txt", lang = missing)
ss = Prefixes(prefix_file = "test.txt", lang = nothing)
@test "another-test-prefix" in keys(ss.non_breaking_prefixes) &&
!("Apr" in keys(ss.non_breaking_prefixes))

@@ -28,7 +28,7 @@
"test-prefix" in keys(ss.non_breaking_prefixes)

ss = Prefixes(
Dict("test-prefix" => Sentencize.default), prefix_file = "test.txt", lang = missing)
Dict("test-prefix" => Sentencize.default), prefix_file = "test.txt", lang = nothing)
@test "another-test-prefix" in keys(ss.non_breaking_prefixes) &&
!("Apr" in keys(ss.non_breaking_prefixes)) &&
"test-prefix" in keys(ss.non_breaking_prefixes)
@@ -40,6 +40,11 @@
"test-prefix" in keys(ss.non_breaking_prefixes)

@test_throws ArgumentError Prefixes(lang = "some-weird-language")

## Test non-string prefix
ss = Prefixes(Dict(strip(" test-prefix") => Sentencize.default), lang = nothing)
@test "test-prefix" in keys(ss.non_breaking_prefixes) &&
!("Apr" in keys(ss.non_breaking_prefixes))
end

@testset "split_sentence" begin
@@ -106,4 +111,10 @@ end
prefixes = Dict("Prefix1" => Sentencize.default, "#Hello" => Sentencize.default,
"Prefix2" => Sentencize.default)) ==
["Hello.", "Prefix1. Prefix2. Hello again.", "Good bye."]

## Different string types
@test split_sentence(strip(" Hello. Prefix1. Prefix2. Hello again. Good bye. "),
prefixes = Dict("Prefix1" => Sentencize.default, "#Hello" => Sentencize.default,
"Prefix2" => Sentencize.default)) ==
["Hello.", "Prefix1. Prefix2. Hello again.", "Good bye."]
end