Merge pull request #19 from JuliaAI/dev
For a 0.1.3 release
pazzo83 authored Jan 18, 2022
2 parents 36422b0 + 2b6cb31 commit 2af3837
Showing 7 changed files with 41 additions and 25 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "MLJText"
uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
version = "0.1.2"
version = "0.1.3"

[deps]
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
8 changes: 4 additions & 4 deletions src/abstract_text_transformer.jl
@@ -34,17 +34,17 @@ function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
high = round(Int, transformer.max_doc_freq * n)
low = round(Int, transformer.min_doc_freq * n)
-new_dtm, vocab = limit_features(dtm_matrix, high, low)
+new_doc_term_mat, vocab = limit_features(dtm_matrix, high, low)
else
-new_dtm = dtm_matrix.dtm
+new_doc_term_mat = dtm_matrix.dtm
vocab = dtm_matrix.terms
end

# calculate IDF
-idf = compute_idf(transformer.smooth_idf, new_dtm)
+idf = compute_idf(transformer.smooth_idf, new_doc_term_mat)

# prepare result
-fitresult = get_result(transformer, idf, vocab)
+fitresult = get_result(transformer, idf, vocab, new_doc_term_mat)
cache = nothing

return fitresult, cache, NamedTuple()
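For context on the `compute_idf` call above (its body lives in src/utils.jl and is collapsed in this diff), the sketch below shows the conventional smoothed/unsmoothed inverse document frequency in the terms-by-documents orientation used throughout this package. It is an illustrative assumption in the style popularized by scikit-learn, not a verbatim copy of MLJText's implementation, and the name `example_idf` is made up. Note also that the hunk above now threads `new_doc_term_mat` into `get_result`, which is what lets the BM25 fit result capture corpus statistics at fit time (see the bm25_transformer.jl changes below).

    using SparseArrays

    # Illustrative only: conventional (scikit-learn-style) IDF over a
    # terms-by-documents count matrix; NOT the exact body of MLJText's compute_idf.
    function example_idf(smooth_idf::Bool, doc_term_mat::SparseMatrixCSC)
        n = size(doc_term_mat, 2)                    # number of documents
        df = vec(sum(doc_term_mat .> 0; dims=2))     # documents containing each term
        s = smooth_idf ? 1 : 0                       # smoothing avoids division by zero
        return @. log((n + s) / (df + s)) + 1        # one IDF entry per term
    end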
2 changes: 0 additions & 2 deletions src/bagofwords_transformer.jl
@@ -2,7 +2,6 @@
BagOfWordsTransformer()
Convert a collection of raw documents to a matrix representing a bag-of-words structure.
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
a count of every word in the document corpus/collection for every document. This is a simple
but often quite powerful way of representing documents as vectors. The resulting representation is
@@ -12,7 +11,6 @@ document.
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
to words occurring in a maximum or minimum portion of documents.
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that only terms appearing in
up to the specified percentage of documents will be considered. For example, if
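To make the `max_doc_freq` / `min_doc_freq` knobs described in the docstring concrete, here is a hypothetical usage sketch. It assumes the standard MLJ machine workflow and that pre-tokenized documents are an accepted input; the toy corpus and variable names are made up, and the exact accepted scitypes are documented by MLJText itself.

    using MLJ, MLJText

    # Three toy, pre-tokenized documents (illustrative input only).
    docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "cat", "ran"]]

    # Drop terms appearing in more than 80% or fewer than 20% of documents;
    # with three documents, "the" (present in all of them) is excluded.
    bow = BagOfWordsTransformer(max_doc_freq = 0.8, min_doc_freq = 0.2)
    mach = machine(bow, docs)
    fit!(mach)
    X = transform(mach, docs)   # documents-by-terms count representation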
21 changes: 14 additions & 7 deletions src/bm25_transformer.jl
@@ -57,16 +57,22 @@ end
struct BMI25TransformerResult
vocab::Vector{String}
idf_vector::Vector{Float64}
+mean_words_in_docs::Float64
end

-get_result(::BM25Transformer, idf::Vector{Float64}, vocab::Vector{String}) = BMI25TransformerResult(vocab, idf)
+function get_result(::BM25Transformer, idf::Vector{F}, vocab::Vector{String}, doc_term_mat::SparseMatrixCSC) where {F <: AbstractFloat}
+words_in_documents = F.(sum(doc_term_mat; dims=1))
+mean_words_in_docs = mean(words_in_documents)
+BMI25TransformerResult(vocab, idf, mean_words_in_docs)
+end

# BM25: Okapi Best Match 25
# Details at: https://en.wikipedia.org/wiki/Okapi_BM25
# derived from https://github.com/zgornel/StringAnalysis.jl/blob/master/src/stats.jl
function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
bm25::SparseMatrixCSC{F},
-idf_vector::Vector{F};
+idf_vector::Vector{F},
+mean_words_in_docs::Float64;
κ::Int=2,
β::Float64=0.75) where {T <: Real, F <: AbstractFloat}
@assert size(doc_term_mat) == size(bm25)
@@ -82,7 +88,7 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},

# TF tells us what proportion of a document is defined by a term
words_in_documents = F.(sum(doc_term_mat; dims=1))
-ln = words_in_documents ./ mean(words_in_documents)
+ln = words_in_documents ./ mean_words_in_docs
oneval = one(F)

for i = 1:n
@@ -100,9 +106,9 @@ end
function _transform(transformer::BM25Transformer,
result::BMI25TransformerResult,
v::Corpus)
-dtm_matrix = build_dtm(v, result.vocab)
-bm25 = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-build_bm25!(dtm_matrix.dtm, bm25, result.idf_vector; κ=transformer.κ, β=transformer.β)
+doc_terms = build_dtm(v, result.vocab)
+bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
+build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
@@ -113,7 +119,8 @@ end
function MMI.fitted_params(::BM25Transformer, fitresult)
vocab = fitresult.vocab
idf_vector = fitresult.idf_vector
-return (vocab = vocab, idf_vector = idf_vector)
+mean_words_in_docs = fitresult.mean_words_in_docs
+return (vocab = vocab, idf_vector = idf_vector, mean_words_in_docs = mean_words_in_docs)
end


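The substantive change in this file: the mean document length used for BM25 length normalization is now computed once from the training corpus in `get_result` and stored on the fit result, instead of being recomputed from whatever corpus is passed to `transform`. That makes a document's BM25 weights independent of how many other documents are transformed alongside it (the new test in test/abstract_text_transformer.jl checks exactly this). For reference, the per-cell Okapi BM25 weight that `build_bm25!` assembles follows the standard form from the Wikipedia article linked above; the one-liner below is an illustrative restatement, not code from this repository.

    # tf: raw count of the term in the document
    # len_ratio: document word count divided by mean_words_in_docs (the fit-time mean)
    # idf: inverse document frequency of the term; κ and β are the usual BM25 constants
    bm25_weight(tf, len_ratio, idf; κ = 2, β = 0.75) =
        idf * tf * (κ + 1) / (tf + κ * (1 - β + β * len_ratio))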
9 changes: 5 additions & 4 deletions src/tfidf_transformer.jl
@@ -60,7 +60,8 @@ struct TfidfTransformerResult
idf_vector::Vector{Float64}
end

-get_result(::TfidfTransformer, idf::Vector{Float64}, vocab::Vector{String}) = TfidfTransformerResult(vocab, idf)
+get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) =
+TfidfTransformerResult(vocab, idf)

function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
tfidf::SparseMatrixCSC{F},
@@ -89,9 +90,9 @@ end
function _transform(::TfidfTransformer,
result::TfidfTransformerResult,
v::Corpus)
-dtm_matrix = build_dtm(v, result.vocab)
-tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
+doc_terms = build_dtm(v, result.vocab)
+tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
+build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
14 changes: 7 additions & 7 deletions src/utils.jl
@@ -1,7 +1,7 @@
-function limit_features(doc_term_matrix::DocumentTermMatrix,
+function limit_features(doc_terms::DocumentTermMatrix,
high::Int,
low::Int)
-doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
+doc_freqs = vec(sum(doc_terms.dtm, dims=2))

# build mask to restrict terms
mask = trues(length(doc_freqs))
@@ -12,9 +12,9 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,
mask .&= (doc_freqs .>= low)
end

-new_terms = doc_term_matrix.terms[mask]
+new_terms = doc_terms.terms[mask]

-return (doc_term_matrix.dtm[mask, :], new_terms)
+return (doc_terms.dtm[mask, :], new_terms)
end

## Helper functions to build Corpus ##
@@ -55,11 +55,11 @@ function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
end
end
if length(rows) > 0
-doc_term_matrix = sparse(rows, columns, values, m, n)
+doc_term_mat = sparse(rows, columns, values, m, n)
else
-doc_term_matrix = spzeros(Int, m, n)
+doc_term_mat = spzeros(Int, m, n)
end
-DocumentTermMatrix(doc_term_matrix, terms, row_indices)
+DocumentTermMatrix(doc_term_mat, terms, row_indices)
end

## General method to calculate IDF vector ##
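To see the document-frequency thresholds from `_fit` and `limit_features` above in action, here is a small worked example; it re-creates the masking arithmetic purely for illustration and is not the package function itself.

    # With n = 10 documents, max_doc_freq = 0.8 and min_doc_freq = 0.2:
    doc_freqs = [9, 5, 1]                                   # documents containing each of three terms
    high, low = round(Int, 0.8 * 10), round(Int, 0.2 * 10)  # 8 and 2
    mask = (doc_freqs .<= high) .& (doc_freqs .>= low)      # only the middle term (5 documents) survives
    # The corresponding rows of the terms-by-documents matrix and entries of the
    # vocabulary are then kept, as in the return value of limit_features.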
10 changes: 10 additions & 0 deletions test/abstract_text_transformer.jl
@@ -60,6 +60,16 @@ using TextAnalysis
@test sum(test_doc_transform, dims=2)[2] > 0.0
@test size(test_doc_transform) == (2, 11)
end

+# test proper fit:
+# here we are testing to make sure the size of the corpus to be
+# transformed does not alter the transformation that the model
+# is doing.
+for mach = test_machines
+single_doc_transform = transform(mach, [test_doc2])
+multiple_doc_transform = transform(mach, [test_doc2, test_doc2])
+@test single_doc_transform[1, :] == multiple_doc_transform[1, :]
+end
end

@testset "bag of words use" begin
