Skip to content

Commit 2af3837

Browse files
authored
Merge pull request #19 from JuliaAI/dev
For a 0.1.3 release
2 parents 36422b0 + 2b6cb31 commit 2af3837

7 files changed

+41
-25
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "MLJText"
22
uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
33
authors = ["Chris Alexander <[email protected]>, Anthony D. Blaom <[email protected]>"]
4-
version = "0.1.2"
4+
version = "0.1.3"
55

66
[deps]
77
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"

src/abstract_text_transformer.jl

+4-4
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,17 @@ function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
3434
if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
3535
high = round(Int, transformer.max_doc_freq * n)
3636
low = round(Int, transformer.min_doc_freq * n)
37-
new_dtm, vocab = limit_features(dtm_matrix, high, low)
37+
new_doc_term_mat, vocab = limit_features(dtm_matrix, high, low)
3838
else
39-
new_dtm = dtm_matrix.dtm
39+
new_doc_term_mat = dtm_matrix.dtm
4040
vocab = dtm_matrix.terms
4141
end
4242

4343
# calculate IDF
44-
idf = compute_idf(transformer.smooth_idf, new_dtm)
44+
idf = compute_idf(transformer.smooth_idf, new_doc_term_mat)
4545

4646
# prepare result
47-
fitresult = get_result(transformer, idf, vocab)
47+
fitresult = get_result(transformer, idf, vocab, new_doc_term_mat)
4848
cache = nothing
4949

5050
return fitresult, cache, NamedTuple()

src/bagofwords_transformer.jl

-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
BagOfWordsTransformer()
33
44
Convert a collection of raw documents to a matrix representing a bag-of-words structure.
5-
65
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
76
a count of every word in the document corpus/collection for every document. This is a simple
87
but often quite powerful way of representing documents as vectors. The resulting representation is
@@ -12,7 +11,6 @@ document.
1211
1312
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
1413
to words occurring in a maximum or minimum portion of documents.
15-
1614
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
1715
that the transformer will consider. `max_doc_freq` indicates that terms in only
1816
up to the specified percentage of documents will be considered. For example, if

src/bm25_transformer.jl

+14-7
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,22 @@ end
5757
struct BMI25TransformerResult
5858
vocab::Vector{String}
5959
idf_vector::Vector{Float64}
60+
mean_words_in_docs::Float64
6061
end
6162

62-
get_result(::BM25Transformer, idf::Vector{Float64}, vocab::Vector{String}) = BMI25TransformerResult(vocab, idf)
63+
function get_result(::BM25Transformer, idf::Vector{F}, vocab::Vector{String}, doc_term_mat::SparseMatrixCSC) where {F <: AbstractFloat}
64+
words_in_documents = F.(sum(doc_term_mat; dims=1))
65+
mean_words_in_docs = mean(words_in_documents)
66+
BMI25TransformerResult(vocab, idf, mean_words_in_docs)
67+
end
6368

6469
# BM25: Okapi Best Match 25
6570
# Details at: https://en.wikipedia.org/wiki/Okapi_BM25
6671
# derived from https://github.com/zgornel/StringAnalysis.jl/blob/master/src/stats.jl
6772
function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
6873
bm25::SparseMatrixCSC{F},
69-
idf_vector::Vector{F};
74+
idf_vector::Vector{F},
75+
mean_words_in_docs::Float64;
7076
κ::Int=2,
7177
β::Float64=0.75) where {T <: Real, F <: AbstractFloat}
7278
@assert size(doc_term_mat) == size(bm25)
@@ -82,7 +88,7 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
8288

8389
# TF tells us what proportion of a document is defined by a term
8490
words_in_documents = F.(sum(doc_term_mat; dims=1))
85-
ln = words_in_documents ./ mean(words_in_documents)
91+
ln = words_in_documents ./ mean_words_in_docs
8692
oneval = one(F)
8793

8894
for i = 1:n
@@ -100,9 +106,9 @@ end
100106
function _transform(transformer::BM25Transformer,
101107
result::BMI25TransformerResult,
102108
v::Corpus)
103-
dtm_matrix = build_dtm(v, result.vocab)
104-
bm25 = similar(dtm_matrix.dtm, eltype(result.idf_vector))
105-
build_bm25!(dtm_matrix.dtm, bm25, result.idf_vector; κ=transformer.κ, β=transformer.β)
109+
doc_terms = build_dtm(v, result.vocab)
110+
bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
111+
build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
106112

107113
# here we return the `adjoint` of our sparse matrix to conform to
108114
# the `n x p` dimensions throughout MLJ
@@ -113,7 +119,8 @@ end
113119
function MMI.fitted_params(::BM25Transformer, fitresult)
114120
vocab = fitresult.vocab
115121
idf_vector = fitresult.idf_vector
116-
return (vocab = vocab, idf_vector = idf_vector)
122+
mean_words_in_docs = fitresult.mean_words_in_docs
123+
return (vocab = vocab, idf_vector = idf_vector, mean_words_in_docs = mean_words_in_docs)
117124
end
118125

119126

src/tfidf_transformer.jl

+5-4
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ struct TfidfTransformerResult
6060
idf_vector::Vector{Float64}
6161
end
6262

63-
get_result(::TfidfTransformer, idf::Vector{Float64}, vocab::Vector{String}) = TfidfTransformerResult(vocab, idf)
63+
get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) =
64+
TfidfTransformerResult(vocab, idf)
6465

6566
function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
6667
tfidf::SparseMatrixCSC{F},
@@ -89,9 +90,9 @@ end
8990
function _transform(::TfidfTransformer,
9091
result::TfidfTransformerResult,
9192
v::Corpus)
92-
dtm_matrix = build_dtm(v, result.vocab)
93-
tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
94-
build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
93+
doc_terms = build_dtm(v, result.vocab)
94+
tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
95+
build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)
9596

9697
# here we return the `adjoint` of our sparse matrix to conform to
9798
# the `n x p` dimensions throughout MLJ

src/utils.jl

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
function limit_features(doc_term_matrix::DocumentTermMatrix,
1+
function limit_features(doc_terms::DocumentTermMatrix,
22
high::Int,
33
low::Int)
4-
doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
4+
doc_freqs = vec(sum(doc_terms.dtm, dims=2))
55

66
# build mask to restrict terms
77
mask = trues(length(doc_freqs))
@@ -12,9 +12,9 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,
1212
mask .&= (doc_freqs .>= low)
1313
end
1414

15-
new_terms = doc_term_matrix.terms[mask]
15+
new_terms = doc_terms.terms[mask]
1616

17-
return (doc_term_matrix.dtm[mask, :], new_terms)
17+
return (doc_terms.dtm[mask, :], new_terms)
1818
end
1919

2020
## Helper functions to build Corpus ##
@@ -55,11 +55,11 @@ function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
5555
end
5656
end
5757
if length(rows) > 0
58-
doc_term_matrix = sparse(rows, columns, values, m, n)
58+
doc_term_mat = sparse(rows, columns, values, m, n)
5959
else
60-
doc_term_matrix = spzeros(Int, m, n)
60+
doc_term_mat = spzeros(Int, m, n)
6161
end
62-
DocumentTermMatrix(doc_term_matrix, terms, row_indices)
62+
DocumentTermMatrix(doc_term_mat, terms, row_indices)
6363
end
6464

6565
## General method to calculate IDF vector ##

test/abstract_text_transformer.jl

+10
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,16 @@ using TextAnalysis
6060
@test sum(test_doc_transform, dims=2)[2] > 0.0
6161
@test size(test_doc_transform) == (2, 11)
6262
end
63+
64+
# test proper fit:
65+
# here we are testing to make sure the size of the corpus to be
66+
# transformed does not alter the transformation that the model
67+
# is doing.
68+
for mach = test_machines
69+
single_doc_transform = transform(mach, [test_doc2])
70+
multiple_doc_transform = transform(mach, [test_doc2, test_doc2])
71+
@test single_doc_transform[1, :] == multiple_doc_transform[1, :]
72+
end
6373
end
6474

6575
@testset "bag of words use" begin

0 commit comments

Comments
 (0)