Merge pull request #19 from JuliaAI/dev

pazzo83 · web-flow · commit 2af3837f5588 · 2022-01-18T16:46:13.000-05:00
For a 0.1.3 release
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 authors = ["Chris Alexander <uvapazzo@gmail.com>, Anthony D. Blaom <anthony.blaom@gmail.com>"]
-version = "0.1.2"
+version = "0.1.3"
 
 [deps]
 CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
diff --git a/src/abstract_text_transformer.jl b/src/abstract_text_transformer.jl
@@ -34,17 +34,17 @@ function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
         high = round(Int, transformer.max_doc_freq * n)
         low = round(Int, transformer.min_doc_freq * n)
-        new_dtm, vocab = limit_features(dtm_matrix, high, low)
+        new_doc_term_mat, vocab = limit_features(dtm_matrix, high, low)
     else
-        new_dtm = dtm_matrix.dtm
+        new_doc_term_mat = dtm_matrix.dtm
         vocab = dtm_matrix.terms
     end
 
     # calculate IDF
-    idf = compute_idf(transformer.smooth_idf, new_dtm)
+    idf = compute_idf(transformer.smooth_idf, new_doc_term_mat)
 
     # prepare result
-    fitresult = get_result(transformer, idf, vocab)
+    fitresult = get_result(transformer, idf, vocab, new_doc_term_mat)
     cache = nothing
 
     return fitresult, cache, NamedTuple()
diff --git a/src/bagofwords_transformer.jl b/src/bagofwords_transformer.jl
@@ -2,7 +2,6 @@
     BagOfWordsTransformer()
 
 Convert a collection of raw documents to matrix representing a bag-of-words structure.
-
 Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
 a count of every word in the document corpus/collection for every document. This is a simple
 but often quite powerful way of representing documents as vectors. The resulting representation is
@@ -12,7 +11,6 @@ document.
 
 Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
 to words occuring in a maximum or minimum portion of documents.
-
 The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
 that the transformer will consider. `max_doc_freq` indicates that terms in only
 up to the specified percentage of documents will be considered. For example, if
diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
@@ -57,16 +57,22 @@ end
 struct BMI25TransformerResult
     vocab::Vector{String}
     idf_vector::Vector{Float64}
+    mean_words_in_docs::Float64
 end
 
-get_result(::BM25Transformer, idf::Vector{Float64}, vocab::Vector{String}) = BMI25TransformerResult(vocab, idf)
+function get_result(::BM25Transformer, idf::Vector{F}, vocab::Vector{String}, doc_term_mat::SparseMatrixCSC) where {F <: AbstractFloat}
+    words_in_documents = F.(sum(doc_term_mat; dims=1))
+    mean_words_in_docs = mean(words_in_documents)
+    BMI25TransformerResult(vocab, idf, mean_words_in_docs)
+end
 
 # BM25: Okapi Best Match 25
 # Details at: https://en.wikipedia.org/wiki/Okapi_BM25
 # derived from https://github.com/zgornel/StringAnalysis.jl/blob/master/src/stats.jl
 function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
                     bm25::SparseMatrixCSC{F},
-                    idf_vector::Vector{F};
+                    idf_vector::Vector{F},
+                    mean_words_in_docs::Float64;
                     κ::Int=2,
                     β::Float64=0.75) where {T <: Real, F <: AbstractFloat}
     @assert size(doc_term_mat) == size(bm25)
@@ -82,7 +88,7 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
 
     # TF tells us what proportion of a document is defined by a term
     words_in_documents = F.(sum(doc_term_mat; dims=1))
-    ln = words_in_documents ./ mean(words_in_documents)
+    ln = words_in_documents ./ mean_words_in_docs
     oneval = one(F)
 
     for i = 1:n
@@ -100,9 +106,9 @@ end
 function _transform(transformer::BM25Transformer, 
                     result::BMI25TransformerResult,
                     v::Corpus)
-    dtm_matrix = build_dtm(v, result.vocab)
-    bm25 = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-    build_bm25!(dtm_matrix.dtm, bm25, result.idf_vector; κ=transformer.κ, β=transformer.β)
+    doc_terms = build_dtm(v, result.vocab)
+    bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
+    build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
     # here we return the `adjoint` of our sparse matrix to conform to 
     # the `n x p` dimensions throughout MLJ
@@ -113,7 +119,8 @@ end
 function MMI.fitted_params(::BM25Transformer, fitresult)
     vocab = fitresult.vocab
     idf_vector = fitresult.idf_vector
-    return (vocab = vocab, idf_vector = idf_vector)
+    mean_words_in_docs = fitresult.mean_words_in_docs
+    return (vocab = vocab, idf_vector = idf_vector, mean_words_in_docs = mean_words_in_docs)
 end
 
 
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
@@ -60,7 +60,8 @@ struct TfidfTransformerResult
     idf_vector::Vector{Float64}
 end
 
-get_result(::TfidfTransformer, idf::Vector{Float64}, vocab::Vector{String}) = TfidfTransformerResult(vocab, idf)
+get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) = 
+    TfidfTransformerResult(vocab, idf)
 
 function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
                       tfidf::SparseMatrixCSC{F},
@@ -89,9 +90,9 @@ end
 function _transform(::TfidfTransformer, 
                     result::TfidfTransformerResult,
                     v::Corpus)
-    dtm_matrix = build_dtm(v, result.vocab)
-    tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-    build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
+    doc_terms = build_dtm(v, result.vocab)
+    tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
+    build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)
 
     # here we return the `adjoint` of our sparse matrix to conform to 
     # the `n x p` dimensions throughout MLJ
diff --git a/src/utils.jl b/src/utils.jl
@@ -1,7 +1,7 @@
-function limit_features(doc_term_matrix::DocumentTermMatrix,
+function limit_features(doc_terms::DocumentTermMatrix,
                         high::Int,
                         low::Int)
-    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
+    doc_freqs = vec(sum(doc_terms.dtm, dims=2))
 
     # build mask to restrict terms
     mask = trues(length(doc_freqs))
@@ -12,9 +12,9 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,
         mask .&= (doc_freqs .>= low)
     end
 
-    new_terms = doc_term_matrix.terms[mask]
+    new_terms = doc_terms.terms[mask]
 
-    return (doc_term_matrix.dtm[mask, :], new_terms)
+    return (doc_terms.dtm[mask, :], new_terms)
 end
 
 ## Helper functions to build Corpus ##
@@ -55,11 +55,11 @@ function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
         end
     end
     if length(rows) > 0
-        doc_term_matrix = sparse(rows, columns, values, m, n)
+        doc_term_mat = sparse(rows, columns, values, m, n)
     else
-        doc_term_matrix = spzeros(Int, m, n)
+        doc_term_mat = spzeros(Int, m, n)
     end
-    DocumentTermMatrix(doc_term_matrix, terms, row_indices)
+    DocumentTermMatrix(doc_term_mat, terms, row_indices)
 end
 
 ## General method to calculate IDF vector ##
diff --git a/test/abstract_text_transformer.jl b/test/abstract_text_transformer.jl
@@ -60,6 +60,16 @@ using TextAnalysis
         @test sum(test_doc_transform, dims=2)[2] > 0.0
         @test size(test_doc_transform) == (2, 11)
     end
+
+    # test proper fit:
+    # make sure the number of documents passed to `transform`
+    # does not change the representation the fitted model
+    # produces for any individual document.
+    for mach = test_machines
+        single_doc_transform = transform(mach, [test_doc2])
+        multiple_doc_transform = transform(mach, [test_doc2, test_doc2])
+        @test single_doc_transform[1, :] == multiple_doc_transform[1, :]
+    end
 end
 
 @testset "bag of words use" begin