Merge pull request #28 from JuliaAI/dev

ablaom · web-flow · commit 3505afe2b757 · 2024-01-10T08:39:11.000+13:00
For a 0.2.2 release
diff --git a/.github/codecov.yml b/.github/codecov.yml
@@ -0,0 +1,5 @@
+coverage:
+  status:
+    project:
+      default:
+        threshold: 0.5%
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3'
+          - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia.
         os:
           - ubuntu-latest
@@ -44,6 +44,7 @@ jobs:
         env:
           JULIA_NUM_THREADS: 2
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v1
+      - uses: codecov/codecov-action@v3
         with:
-          file: lcov.info
+          files: lcov.info
+
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJText"
 uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
 authors = ["Chris Alexander <uvapazzo@gmail.com>, Anthony D. Blaom <anthony.blaom@gmail.com>"]
-version = "0.2.1"
+version = "0.2.2"
 
 [deps]
 CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
@@ -18,7 +18,7 @@ MLJModelInterface = "1.4"
 ScientificTypes = "2.2.2, 3"
 ScientificTypesBase = "2.2.0, 3"
 TextAnalysis = "0.7.3"
-julia = "1.3"
+julia = "1.6"
 
 [extras]
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
@@ -119,7 +119,7 @@ The transformer converts a collection of documents, tokenized or pre-parsed as b
 words/ngrams, to a matrix of [Okapi BM25 document-word
 statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
 term frequency (TF) and inverse document frequency (IDF, defined below), as in
-[`TfidfTransformer`](ref), but additionally adjusts for the probability that a user will
+[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
 consider a search result relevant based, on the terms in the search query and those in
 each document.
 
@@ -137,21 +137,21 @@ In MLJ or MLJBase, bind an instance `model` to data with
 
     mach = machine(model, X)
 
-$DOC_IDF
+$DOC_TRANSFORMER_INPUTS
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `> max_doc_freq` documents will not be considered by the transformer. For
+  example, if `max_doc_freq` is set to 0.9, terms that are in more than 90% of the
+  documents will be removed.
 
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< max_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are at least in 1% of the
-  documents will be included.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `< min_doc_freq` documents will not be considered by the transformer. A
+  value of 0.01 means that only terms that are in at least 1% of the documents will be
+  included.
 
 - `κ=2`: The term frequency saturation characteristic. Higher values represent slower
   saturation. What we mean by saturation is the degree to which a term occurring extra
diff --git a/src/count_transformer.jl b/src/count_transformer.jl
@@ -94,15 +94,15 @@ Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< max_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are at least in 1% of the
-  documents will be included.
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `> max_doc_freq` documents will not be considered by the transformer. For
+  example, if `max_doc_freq` is set to 0.9, terms that are in more than 90% of the
+  documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `< min_doc_freq` documents will not be considered by the transformer. A
+  value of 0.01 means that only terms that are in at least 1% of the documents will be
+  included.
 
 # Operations