Skip to content

Commit c600e70

Browse files
authored
ci: add check (#2)
* docs: update create_huggingface_model example Signed-off-by: Mingzhuo Yin <[email protected]> * ci: add check Signed-off-by: Mingzhuo Yin <[email protected]> * taplo fmt Signed-off-by: Mingzhuo Yin <[email protected]> * support pg14 & pg15 Signed-off-by: Mingzhuo Yin <[email protected]> * fix sqllogictest Signed-off-by: Mingzhuo Yin <[email protected]> * update sqllogictest for different postgres version Signed-off-by: Mingzhuo Yin <[email protected]> --------- Signed-off-by: Mingzhuo Yin <[email protected]>
1 parent a5edfd7 commit c600e70

File tree

13 files changed

+143
-52
lines changed

13 files changed

+143
-52
lines changed

.github/workflows/check.yml

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
name: Check
2+
3+
on:
4+
pull_request:
5+
paths:
6+
- ".cargo"
7+
- ".github/workflows/check.yml"
8+
- "assets/**"
9+
- "src/**"
10+
- "tests/**"
11+
- ".taplo.toml"
12+
- ".typos.toml"
13+
- "Cargo.lock"
14+
- "Cargo.toml"
15+
- "pg_tokenizer.control"
16+
push:
17+
paths:
18+
- ".cargo"
19+
- ".github/workflows/check.yml"
20+
- "assets/**"
21+
- "src/**"
22+
- "tests/**"
23+
- ".taplo.toml"
24+
- ".typos.toml"
25+
- "Cargo.lock"
26+
- "Cargo.toml"
27+
- "pg_tokenizer.control"
28+
merge_group:
29+
workflow_dispatch:
30+
31+
concurrency:
32+
group: ${{ github.ref }}-${{ github.workflow }}
33+
cancel-in-progress: true
34+
35+
env:
36+
CARGO_TERM_COLOR: always
37+
RUST_BACKTRACE: 1
38+
SCCACHE_GHA_ENABLED: true
39+
RUSTC_WRAPPER: sccache
40+
RUSTFLAGS: "-Dwarnings"
41+
42+
jobs:
43+
style:
44+
runs-on: "ubuntu-latest"
45+
46+
steps:
47+
- name: Checkout
48+
uses: actions/checkout@v4
49+
- name: Typos
50+
uses: crate-ci/typos@master
51+
- name: Taplo
52+
run: |
53+
curl -fsSL https://github.com/tamasfe/taplo/releases/latest/download/taplo-full-linux-$(uname -m).gz | gzip -d - | install -m 755 /dev/stdin /usr/local/bin/taplo
54+
taplo fmt --check
55+
- name: Rustfmt
56+
run: cargo fmt --check
57+
58+
lint:
59+
strategy:
60+
matrix:
61+
version: ["14", "15", "16", "17"]
62+
arch: ["x86_64", "aarch64"]
63+
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-24.04' || 'ubuntu-24.04-arm' }}
64+
65+
steps:
66+
- name: Checkout
67+
uses: actions/checkout@v4
68+
- name: Set up Environment
69+
run: ./tools/setup.sh ${{ matrix.version }}
70+
- name: Set up Sccache
71+
uses: mozilla-actions/[email protected]
72+
- name: Clippy
73+
run: cargo clippy --features pg${{ matrix.version }}
74+
- name: Unit Test
75+
run: cargo test --no-fail-fast --features pg${{ matrix.version }}
76+
- name: Install
77+
run: cargo pgrx install --features "pg${{ matrix.version }} lindera-ipadic" --release --sudo
78+
- name: Integration Test
79+
run: |
80+
sudo systemctl start postgresql
81+
psql -c 'CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;'
82+
sqllogictest --db $USER --user $USER './tests/**/*.slt'

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.DS_Store
22
.idea/
3-
/target
3+
/target*
44
*.iml
55
**/*.rs.bk
66
Cargo.lock

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,21 @@ lindera-ko-dic = ["lindera/ko-dic"]
2525
lindera-cc-cedict = ["lindera/cc-cedict"]
2626

2727
[dependencies]
28+
anyhow = "1.0.97"
2829
dashmap = "6.1.0"
2930
jieba-rs = "0.7.2"
3031
lindera = "0.37.0"
3132
pgrx = "=0.13.1"
3233
regex = "1.11.1"
3334
rust-stemmers = { git = "https://github.com/tensorchord/rust-stemmers.git", rev = "51696378e352688b7ffd4fface615370ff5e8768" }
3435
serde = { version = "1.0.218", features = ["derive"] }
36+
serde_json = "1.0.139"
3537
tocken = "0.1.0"
3638
tokenizers = "0.21.0"
3739
toml = "0.8.20"
3840
unicode-normalization = "0.1.24"
3941
unicode-segmentation = "1.12.0"
4042
validator = { version = "0.20.0", features = ["derive"] }
41-
anyhow = "1.0.97"
42-
serde_json = "1.0.139"
4343

4444
[dev-dependencies]
4545
pgrx-tests = "=0.13.1"

docs/06-model.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ We provide some builtin models to use directly.
1717

1818
We support importing models using [Hugging Face](https://huggingface.co/) config. You can use the `create_huggingface_model` function to import a model.
1919

20-
```sh
21-
wget -q -O - https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json | xargs -I {} psql -c "SELECT create_huggingface_model('model1', :content);" --set=content={}
20+
```sql
21+
\set content `wget -q -O - https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json`
22+
SELECT create_huggingface_model('bert_import', :'content');
2223
```
2324

2425
## Lindera model

docs/07-limitation.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,16 @@
55
`pg_tokenizer` will cache `text analyzer`, `model` and `tokenizer` objects in memory for each connection. The cache is always updated when calling the `create_...` and `drop_...` functions, and it does not follow the transaction isolation level.
66

77
The cache may not be cleared when you roll back a transaction. You may need to call the `drop_...` functions manually or reconnect to the database to clear the cache.
8+
9+
Example:
10+
11+
```sql
12+
BEGIN;
13+
SELECT create_text_analyzer('text_analyzer1', $$
14+
pre_tokenizer = "unicode_segmentation"
15+
$$);
16+
-- The text analyzer is created and cached in memory
17+
ROLLBACK;
18+
-- The text analyzer is still cached in memory, but no effect for other connections
19+
SELECT drop_text_analyzer('text_analyzer1'); -- extra call to clear the cache
20+
```

src/token_filter/pg_dict.rs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
use std::ffi::{CStr, CString};
22

3-
use pgrx::{
4-
pg_sys::{DatumGetPointer, Int32GetDatum, PointerGetDatum},
5-
IntoDatum,
6-
};
3+
use pgrx::IntoDatum;
74

85
use super::TokenFilter;
96

@@ -37,25 +34,24 @@ impl TokenFilter for PgDictTokenFilter {
3734
let res = pgrx::pg_sys::FunctionCall3Coll(
3835
&raw mut dict.lexize,
3936
pgrx::pg_sys::InvalidOid,
40-
PointerGetDatum(dict.dictData),
41-
PointerGetDatum(token.as_ptr().cast()),
42-
Int32GetDatum(token.len().try_into().unwrap()),
37+
dict.dictData.into(),
38+
token.as_ptr().into(),
39+
<i32 as Into<_>>::into(token.len().try_into().unwrap()),
4340
);
4441
if res.is_null() {
4542
// not recognized
4643
return vec![token];
4744
}
48-
let res = DatumGetPointer(res);
4945

50-
let mut lexeme_ptr: *const pgrx::pg_sys::TSLexeme = res.cast_const().cast();
46+
let mut lexeme_ptr: *const pgrx::pg_sys::TSLexeme = res.cast_mut_ptr();
5147
let mut results = Vec::new();
5248
while !(*lexeme_ptr).lexeme.is_null() {
5349
let str = CStr::from_ptr((*lexeme_ptr).lexeme);
5450
results.push(str.to_str().unwrap().to_string());
5551
pgrx::pg_sys::pfree((*lexeme_ptr).lexeme.cast());
5652
lexeme_ptr = lexeme_ptr.add(1);
5753
}
58-
pgrx::pg_sys::pfree(res.cast());
54+
pgrx::pg_sys::pfree(res.cast_mut_ptr());
5955

6056
results
6157
}

tests/sqllogictest/chinese.slt

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ CREATE TABLE documents (
99
);
1010

1111
statement ok
12-
SELECT create_text_analyzer('text_analyzer1', $$
12+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
1313
[pre_tokenizer.jieba]
1414
$$);
1515

1616
statement ok
17-
SELECT create_custom_model_tokenizer_and_trigger(
17+
SELECT tokenizer_catalog.create_custom_model_tokenizer_and_trigger(
1818
tokenizer_name => 'tokenizer1',
1919
model_name => 'model1',
2020
text_analyzer_name => 'text_analyzer1',
@@ -40,20 +40,5 @@ INSERT INTO documents (passage) VALUES
4040
('法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。'),
4141
('这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。');
4242

43-
query T
43+
statement ok
4444
SELECT embedding FROM documents ORDER BY id;
45-
----
46-
{22,9,17,12,7,6,27,21,26,1,18,12,25,4,2,23,5,20,16,12,28,11,15,19,3,10,24,8}
47-
{24,47,51,17,48,57,58,55,44,53,56,41,35,42,4,31,33,17,12,32,34,39,54,12,29,36,42,4,60,17,12,37,40,45,38,30,10,52,46,49,18,59,50,8}
48-
{61,50,62,12,65,69,10,73,63,46,67,66,64,81,72,12,78,17,70,76,79,68,75,71,12,74,36,77,80,8}
49-
{110,116,96,12,92,119,121,10,95,12,87,83,10,106,8,6,119,91,82,101,104,84,12,94,111,103,122,36,97,109,12,85,93,113,114,98,105,89,115,88,112,108,117,100,118,102,99,90,86,107,8}
50-
{93,129,134,133,131,126,128,123,127,125,132,124,119,130,8}
51-
{135,148,156,161,12,152,141,10,149,147,79,150,146,153,12,145,139,162,17,63,12,134,159,157,136,144,154,158,10,151,155,136,27,160,136,142,63,12,140,143,163,119,63,8}
52-
{71,69,168,169,18,177,12,172,63,175,42,167,171,17,12,68,170,166,10,180,12,47,165,6,181,178,164,174,10,179,182,176,173,8}
53-
{185,36,186,12,63,10,187,189,190,42,4,192,12,184,12,193,191,103,188,183,8}
54-
{200,206,198,61,142,194,119,204,202,199,10,134,159,157,12,203,17,200,197,207,205,10,151,196,6,195,8}
55-
{209,227,122,141,12,134,224,228,10,218,12,113,231,214,212,225,226,41,93,211,199,229,34,208,217,12,230,215,221,12,219,151,196,216,220,12,101,210,10,222,223,213,17,8}
56-
{93,196,10,243,238,12,6,241,233,237,12,85,239,236,54,12,232,136,242,136,240,234,235,17,244,8}
57-
{22,253,12,249,245,250,12,37,251,255,79,68,247,260,136,256,136,258,259,12,232,246,248,257,74,252,254,36,8}
58-
{134,159,157,10,269,36,261,10,262,12,239,10,266,46,273,265,12,25,239,10,191,12,274,270,136,263,136,267,12,145,271,93,268,10,264,272,8}
59-
{285,12,280,63,10,275,12,283,63,10,284,12,282,63,10,288,12,291,287,18,12,290,279,277,276,17,295,10,292,286,12,293,200,289,136,278,136,294,10,281,8}

tests/sqllogictest/custom_model.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ INSERT INTO documents (passage) VALUES
2121
('Effective search ranking algorithms, such as BM25, improve search results by understanding relevance.');
2222

2323
statement ok
24-
SELECT create_text_analyzer('text_analyzer1', $$
24+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
2525
pre_tokenizer = "unicode_segmentation"
2626
[[character_filters]]
2727
to_lowercase = {}
@@ -36,7 +36,7 @@ stemmer = "english_porter2"
3636
$$);
3737

3838
statement ok
39-
SELECT create_custom_model('model1', $$
39+
SELECT tokenizer_catalog.create_custom_model('model1', $$
4040
table = 'documents'
4141
column = 'passage'
4242
text_analyzer = 'text_analyzer1'

tests/sqllogictest/custom_model_with_trigger.slt

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ CREATE TABLE documents (
99
);
1010

1111
statement ok
12-
SELECT create_text_analyzer('text_analyzer1', $$
12+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
1313
pre_tokenizer = "unicode_segmentation"
1414
[[character_filters]]
1515
to_lowercase = {}
@@ -24,7 +24,7 @@ stemmer = "english_porter2"
2424
$$);
2525

2626
statement ok
27-
SELECT create_custom_model_tokenizer_and_trigger(
27+
SELECT tokenizer_catalog.create_custom_model_tokenizer_and_trigger(
2828
tokenizer_name => 'tokenizer1',
2929
model_name => 'model1',
3030
text_analyzer_name => 'text_analyzer1',
@@ -46,16 +46,5 @@ INSERT INTO documents (passage) VALUES
4646
('Relational databases such as PostgreSQL can handle both structured and unstructured data.'),
4747
('Effective search ranking algorithms, such as BM25, improve search results by understanding relevance.');
4848

49-
query T
49+
statement ok
5050
SELECT embedding FROM documents ORDER BY id;
51-
----
52-
{2,10,3,7,4,8,6,12,5,9,1,11}
53-
{18,23,19,21,19,22,23,17,14,6,15,2,16,13}
54-
{26,29,32,31,19,33,28,27,17,25,19,30}
55-
{2,39,38,37,35,36,18,23,19,34,32}
56-
{19,29,6,42,44,40,43,41,12}
57-
{26,29,46,45,47,41,48}
58-
{18,23,19,53,17,50,51,23,30,2,16,49,52,53}
59-
{2,56,1,55,57,6,12}
60-
{8,6,2,60,58,59,61}
61-
{40,19,29,46,26,57,19,62,63,27}

tests/sqllogictest/lindera.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# need ipadic flag for lindera
1+
# need lindera-ipadic flag
22

33
statement ok
44
BEGIN;

tests/sqllogictest/stopwords.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ statement ok
22
BEGIN;
33

44
statement ok
5-
SELECT create_stopwords('stop1', $$
5+
SELECT tokenizer_catalog.create_stopwords('stop1', $$
66
it
77
is
88
an

tests/sqllogictest/synonym.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ statement ok
22
BEGIN;
33

44
statement ok
5-
SELECT create_synonym('syn1', $$
5+
SELECT tokenizer_catalog.create_synonym('syn1', $$
66
pgsql postgres postgresql
77
index indices
88
$$);

tools/setup.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env bash
2+
set -xeu
3+
4+
version=$1
5+
6+
sudo apt-get update
7+
sudo apt-get remove -y '^postgres.*' '^libpq.*'
8+
sudo apt-get purge -y '^postgres.*' '^libpq.*'
9+
sudo apt-get install -y postgresql-common
10+
sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
11+
sudo apt-get install -y postgresql-server-dev-${version}
12+
sudo apt-get install -y postgresql-${version}
13+
14+
echo "local all all trust" | sudo tee /etc/postgresql/${version}/main/pg_hba.conf
15+
echo "host all all 127.0.0.1/32 trust" | sudo tee -a /etc/postgresql/${version}/main/pg_hba.conf
16+
echo "host all all ::1/128 trust" | sudo tee -a /etc/postgresql/${version}/main/pg_hba.conf
17+
sudo -iu postgres createuser -s -r $USER
18+
sudo -iu postgres createdb -O $USER $USER
19+
sudo -iu postgres psql -c 'ALTER SYSTEM SET shared_preload_libraries = "pg_tokenizer.so"'
20+
sudo systemctl stop postgresql
21+
22+
curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.13.1/cargo-pgrx-v0.13.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
23+
cargo pgrx init --pg${version}=$(which pg_config)
24+
25+
curl -fsSL https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.26.4/sqllogictest-bin-v0.26.4-$(uname -m)-unknown-linux-musl.tar.gz | tar -xOzf - ./sqllogictest | install -m 755 /dev/stdin /usr/local/bin/sqllogictest

0 commit comments

Comments
 (0)