Skip to content

Commit c600e70

Browse files
authored
ci: add check (#2)
* docs: update create_huggingface_model example Signed-off-by: Mingzhuo Yin <[email protected]> * ci: add check Signed-off-by: Mingzhuo Yin <[email protected]> * taplo fmt Signed-off-by: Mingzhuo Yin <[email protected]> * support pg14 & pg15 Signed-off-by: Mingzhuo Yin <[email protected]> * fix sqllogictest Signed-off-by: Mingzhuo Yin <[email protected]> * update sqllogictest for different postgres version Signed-off-by: Mingzhuo Yin <[email protected]> --------- Signed-off-by: Mingzhuo Yin <[email protected]>
1 parent a5edfd7 commit c600e70

File tree

13 files changed

+143
-52
lines changed

13 files changed

+143
-52
lines changed

.github/workflows/check.yml

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
name: Check
2+
3+
on:
4+
pull_request:
5+
paths:
6+
- ".cargo"
7+
- ".github/workflows/check.yml"
8+
- "assets/**"
9+
- "src/**"
10+
- "tests/**"
11+
- ".taplo.toml"
12+
- ".typos.toml"
13+
- "Cargo.lock"
14+
- "Cargo.toml"
15+
- "pg_tokenizer.control"
16+
push:
17+
paths:
18+
- ".cargo"
19+
- ".github/workflows/check.yml"
20+
- "assets/**"
21+
- "src/**"
22+
- "tests/**"
23+
- ".taplo.toml"
24+
- ".typos.toml"
25+
- "Cargo.lock"
26+
- "Cargo.toml"
27+
- "pg_tokenizer.control"
28+
merge_group:
29+
workflow_dispatch:
30+
31+
concurrency:
32+
group: ${{ github.ref }}-${{ github.workflow }}
33+
cancel-in-progress: true
34+
35+
env:
36+
CARGO_TERM_COLOR: always
37+
RUST_BACKTRACE: 1
38+
SCCACHE_GHA_ENABLED: true
39+
RUSTC_WRAPPER: sccache
40+
RUSTFLAGS: "-Dwarnings"
41+
42+
jobs:
43+
style:
44+
runs-on: "ubuntu-latest"
45+
46+
steps:
47+
- name: Checkout
48+
uses: actions/checkout@v4
49+
- name: Typos
50+
uses: crate-ci/typos@master
51+
- name: Taplo
52+
run: |
53+
curl -fsSL https://github.com/tamasfe/taplo/releases/latest/download/taplo-full-linux-$(uname -m).gz | gzip -d - | install -m 755 /dev/stdin /usr/local/bin/taplo
54+
taplo fmt --check
55+
- name: Rustfmt
56+
run: cargo fmt --check
57+
58+
lint:
59+
strategy:
60+
matrix:
61+
version: ["14", "15", "16", "17"]
62+
arch: ["x86_64", "aarch64"]
63+
runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-24.04' || 'ubuntu-24.04-arm' }}
64+
65+
steps:
66+
- name: Checkout
67+
uses: actions/checkout@v4
68+
- name: Set up Environment
69+
run: ./tools/setup.sh ${{ matrix.version }}
70+
- name: Set up Sccache
71+
uses: mozilla-actions/[email protected]
72+
- name: Clippy
73+
run: cargo clippy --features pg${{ matrix.version }}
74+
- name: Unit Test
75+
run: cargo test --no-fail-fast --features pg${{ matrix.version }}
76+
- name: Install
77+
run: cargo pgrx install --features "pg${{ matrix.version }} lindera-ipadic" --release --sudo
78+
- name: Integration Test
79+
run: |
80+
sudo systemctl start postgresql
81+
psql -c 'CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;'
82+
sqllogictest --db $USER --user $USER './tests/**/*.slt'

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.DS_Store
22
.idea/
3-
/target
3+
/target*
44
*.iml
55
**/*.rs.bk
66
Cargo.lock

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,21 @@ lindera-ko-dic = ["lindera/ko-dic"]
2525
lindera-cc-cedict = ["lindera/cc-cedict"]
2626

2727
[dependencies]
28+
anyhow = "1.0.97"
2829
dashmap = "6.1.0"
2930
jieba-rs = "0.7.2"
3031
lindera = "0.37.0"
3132
pgrx = "=0.13.1"
3233
regex = "1.11.1"
3334
rust-stemmers = { git = "https://github.com/tensorchord/rust-stemmers.git", rev = "51696378e352688b7ffd4fface615370ff5e8768" }
3435
serde = { version = "1.0.218", features = ["derive"] }
36+
serde_json = "1.0.139"
3537
tocken = "0.1.0"
3638
tokenizers = "0.21.0"
3739
toml = "0.8.20"
3840
unicode-normalization = "0.1.24"
3941
unicode-segmentation = "1.12.0"
4042
validator = { version = "0.20.0", features = ["derive"] }
41-
anyhow = "1.0.97"
42-
serde_json = "1.0.139"
4343

4444
[dev-dependencies]
4545
pgrx-tests = "=0.13.1"

docs/06-model.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ We provide some builtin models to use directly.
1717

1818
We support importing models using [Hugging Face](https://huggingface.co/) config. You can use the `create_huggingface_model` function to import a model.
1919

20-
```sh
21-
wget -q -O - https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json | xargs -I {} psql -c "SELECT create_huggingface_model('model1', :content);" --set=content={}
20+
```sql
21+
\set content `wget -q -O - https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json`
22+
SELECT create_huggingface_model('bert_import', :'content');
2223
```
2324

2425
## Lindera model

docs/07-limitation.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,16 @@
55
`pg_tokenizer` will cache `text analyzer`, `model` and `tokenizer` objects in memory for each connection. The cache is always updated when calling the `create_...` and `drop_...` functions, and it does not follow the transaction isolation level.
66

77
The cache may not be cleared when you roll back a transaction. You may need to call the `drop_...` functions manually or reconnect to the database to clear the cache.
8+
9+
Example:
10+
11+
```sql
12+
BEGIN;
13+
SELECT create_text_analyzer('text_analyzer1', $$
14+
pre_tokenizer = "unicode_segmentation"
15+
$$);
16+
-- The text analyzer is created and cached in memory
17+
ROLLBACK;
18+
-- The text analyzer is still cached in memory, but no effect for other connections
19+
SELECT drop_text_analyzer('text_analyzer1'); -- extra call to clear the cache
20+
```

src/token_filter/pg_dict.rs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
use std::ffi::{CStr, CString};
22

3-
use pgrx::{
4-
pg_sys::{DatumGetPointer, Int32GetDatum, PointerGetDatum},
5-
IntoDatum,
6-
};
3+
use pgrx::IntoDatum;
74

85
use super::TokenFilter;
96

@@ -37,25 +34,24 @@ impl TokenFilter for PgDictTokenFilter {
3734
let res = pgrx::pg_sys::FunctionCall3Coll(
3835
&raw mut dict.lexize,
3936
pgrx::pg_sys::InvalidOid,
40-
PointerGetDatum(dict.dictData),
41-
PointerGetDatum(token.as_ptr().cast()),
42-
Int32GetDatum(token.len().try_into().unwrap()),
37+
dict.dictData.into(),
38+
token.as_ptr().into(),
39+
<i32 as Into<_>>::into(token.len().try_into().unwrap()),
4340
);
4441
if res.is_null() {
4542
// not recognized
4643
return vec![token];
4744
}
48-
let res = DatumGetPointer(res);
4945

50-
let mut lexeme_ptr: *const pgrx::pg_sys::TSLexeme = res.cast_const().cast();
46+
let mut lexeme_ptr: *const pgrx::pg_sys::TSLexeme = res.cast_mut_ptr();
5147
let mut results = Vec::new();
5248
while !(*lexeme_ptr).lexeme.is_null() {
5349
let str = CStr::from_ptr((*lexeme_ptr).lexeme);
5450
results.push(str.to_str().unwrap().to_string());
5551
pgrx::pg_sys::pfree((*lexeme_ptr).lexeme.cast());
5652
lexeme_ptr = lexeme_ptr.add(1);
5753
}
58-
pgrx::pg_sys::pfree(res.cast());
54+
pgrx::pg_sys::pfree(res.cast_mut_ptr());
5955

6056
results
6157
}

tests/sqllogictest/chinese.slt

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ CREATE TABLE documents (
99
);
1010

1111
statement ok
12-
SELECT create_text_analyzer('text_analyzer1', $$
12+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
1313
[pre_tokenizer.jieba]
1414
$$);
1515

1616
statement ok
17-
SELECT create_custom_model_tokenizer_and_trigger(
17+
SELECT tokenizer_catalog.create_custom_model_tokenizer_and_trigger(
1818
tokenizer_name => 'tokenizer1',
1919
model_name => 'model1',
2020
text_analyzer_name => 'text_analyzer1',
@@ -40,20 +40,5 @@ INSERT INTO documents (passage) VALUES
4040
('法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。'),
4141
('这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。');
4242

43-
query T
43+
statement ok
4444
SELECT embedding FROM documents ORDER BY id;
45-
----
46-
{22,9,17,12,7,6,27,21,26,1,18,12,25,4,2,23,5,20,16,12,28,11,15,19,3,10,24,8}
47-
{24,47,51,17,48,57,58,55,44,53,56,41,35,42,4,31,33,17,12,32,34,39,54,12,29,36,42,4,60,17,12,37,40,45,38,30,10,52,46,49,18,59,50,8}
48-
{61,50,62,12,65,69,10,73,63,46,67,66,64,81,72,12,78,17,70,76,79,68,75,71,12,74,36,77,80,8}
49-
{110,116,96,12,92,119,121,10,95,12,87,83,10,106,8,6,119,91,82,101,104,84,12,94,111,103,122,36,97,109,12,85,93,113,114,98,105,89,115,88,112,108,117,100,118,102,99,90,86,107,8}
50-
{93,129,134,133,131,126,128,123,127,125,132,124,119,130,8}
51-
{135,148,156,161,12,152,141,10,149,147,79,150,146,153,12,145,139,162,17,63,12,134,159,157,136,144,154,158,10,151,155,136,27,160,136,142,63,12,140,143,163,119,63,8}
52-
{71,69,168,169,18,177,12,172,63,175,42,167,171,17,12,68,170,166,10,180,12,47,165,6,181,178,164,174,10,179,182,176,173,8}
53-
{185,36,186,12,63,10,187,189,190,42,4,192,12,184,12,193,191,103,188,183,8}
54-
{200,206,198,61,142,194,119,204,202,199,10,134,159,157,12,203,17,200,197,207,205,10,151,196,6,195,8}
55-
{209,227,122,141,12,134,224,228,10,218,12,113,231,214,212,225,226,41,93,211,199,229,34,208,217,12,230,215,221,12,219,151,196,216,220,12,101,210,10,222,223,213,17,8}
56-
{93,196,10,243,238,12,6,241,233,237,12,85,239,236,54,12,232,136,242,136,240,234,235,17,244,8}
57-
{22,253,12,249,245,250,12,37,251,255,79,68,247,260,136,256,136,258,259,12,232,246,248,257,74,252,254,36,8}
58-
{134,159,157,10,269,36,261,10,262,12,239,10,266,46,273,265,12,25,239,10,191,12,274,270,136,263,136,267,12,145,271,93,268,10,264,272,8}
59-
{285,12,280,63,10,275,12,283,63,10,284,12,282,63,10,288,12,291,287,18,12,290,279,277,276,17,295,10,292,286,12,293,200,289,136,278,136,294,10,281,8}

tests/sqllogictest/custom_model.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ INSERT INTO documents (passage) VALUES
2121
('Effective search ranking algorithms, such as BM25, improve search results by understanding relevance.');
2222

2323
statement ok
24-
SELECT create_text_analyzer('text_analyzer1', $$
24+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
2525
pre_tokenizer = "unicode_segmentation"
2626
[[character_filters]]
2727
to_lowercase = {}
@@ -36,7 +36,7 @@ stemmer = "english_porter2"
3636
$$);
3737

3838
statement ok
39-
SELECT create_custom_model('model1', $$
39+
SELECT tokenizer_catalog.create_custom_model('model1', $$
4040
table = 'documents'
4141
column = 'passage'
4242
text_analyzer = 'text_analyzer1'

tests/sqllogictest/custom_model_with_trigger.slt

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ CREATE TABLE documents (
99
);
1010

1111
statement ok
12-
SELECT create_text_analyzer('text_analyzer1', $$
12+
SELECT tokenizer_catalog.create_text_analyzer('text_analyzer1', $$
1313
pre_tokenizer = "unicode_segmentation"
1414
[[character_filters]]
1515
to_lowercase = {}
@@ -24,7 +24,7 @@ stemmer = "english_porter2"
2424
$$);
2525

2626
statement ok
27-
SELECT create_custom_model_tokenizer_and_trigger(
27+
SELECT tokenizer_catalog.create_custom_model_tokenizer_and_trigger(
2828
tokenizer_name => 'tokenizer1',
2929
model_name => 'model1',
3030
text_analyzer_name => 'text_analyzer1',
@@ -46,16 +46,5 @@ INSERT INTO documents (passage) VALUES
4646
('Relational databases such as PostgreSQL can handle both structured and unstructured data.'),
4747
('Effective search ranking algorithms, such as BM25, improve search results by understanding relevance.');
4848

49-
query T
49+
statement ok
5050
SELECT embedding FROM documents ORDER BY id;
51-
----
52-
{2,10,3,7,4,8,6,12,5,9,1,11}
53-
{18,23,19,21,19,22,23,17,14,6,15,2,16,13}
54-
{26,29,32,31,19,33,28,27,17,25,19,30}
55-
{2,39,38,37,35,36,18,23,19,34,32}
56-
{19,29,6,42,44,40,43,41,12}
57-
{26,29,46,45,47,41,48}
58-
{18,23,19,53,17,50,51,23,30,2,16,49,52,53}
59-
{2,56,1,55,57,6,12}
60-
{8,6,2,60,58,59,61}
61-
{40,19,29,46,26,57,19,62,63,27}

tests/sqllogictest/lindera.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# need ipadic flag for lindera
1+
# need lindera-ipadic flag
22

33
statement ok
44
BEGIN;

tests/sqllogictest/stopwords.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ statement ok
22
BEGIN;
33

44
statement ok
5-
SELECT create_stopwords('stop1', $$
5+
SELECT tokenizer_catalog.create_stopwords('stop1', $$
66
it
77
is
88
an

tests/sqllogictest/synonym.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ statement ok
22
BEGIN;
33

44
statement ok
5-
SELECT create_synonym('syn1', $$
5+
SELECT tokenizer_catalog.create_synonym('syn1', $$
66
pgsql postgres postgresql
77
index indices
88
$$);

tools/setup.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env bash
2+
set -xeu
3+
4+
version=$1
5+
6+
sudo apt-get update
7+
sudo apt-get remove -y '^postgres.*' '^libpq.*'
8+
sudo apt-get purge -y '^postgres.*' '^libpq.*'
9+
sudo apt-get install -y postgresql-common
10+
sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
11+
sudo apt-get install -y postgresql-server-dev-${version}
12+
sudo apt-get install -y postgresql-${version}
13+
14+
echo "local all all trust" | sudo tee /etc/postgresql/${version}/main/pg_hba.conf
15+
echo "host all all 127.0.0.1/32 trust" | sudo tee -a /etc/postgresql/${version}/main/pg_hba.conf
16+
echo "host all all ::1/128 trust" | sudo tee -a /etc/postgresql/${version}/main/pg_hba.conf
17+
sudo -iu postgres createuser -s -r $USER
18+
sudo -iu postgres createdb -O $USER $USER
19+
sudo -iu postgres psql -c 'ALTER SYSTEM SET shared_preload_libraries = "pg_tokenizer.so"'
20+
sudo systemctl stop postgresql
21+
22+
curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.13.1/cargo-pgrx-v0.13.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
23+
cargo pgrx init --pg${version}=$(which pg_config)
24+
25+
curl -fsSL https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.26.4/sqllogictest-bin-v0.26.4-$(uname -m)-unknown-linux-musl.tar.gz | tar -xOzf - ./sqllogictest | install -m 755 /dev/stdin /usr/local/bin/sqllogictest

0 commit comments

Comments
 (0)