Commit a5edfd7

feat: custom_model_tokenizer able to store bm25vector

Signed-off-by: Mingzhuo Yin <[email protected]>
1 parent: 5b7779d

File tree: 3 files changed, +29 -29 lines

Cargo.toml

Lines changed: 1 addition & 5 deletions
@@ -45,7 +45,7 @@ serde_json = "1.0.139"
 pgrx-tests = "=0.13.1"
 
 [patch.crates-io]
-pgrx = { git = "https://github.com/silver-ymz/pgrx", branch = "chore/add-ts_cache.h" }
+pgrx = { git = "https://github.com/silver-ymz/pgrx", branch = "patch-to-pg_tokenizer" }
 
 [profile.release]
 opt-level = 3
@@ -57,7 +57,3 @@ inherits = "dev"
 opt-level = 3
 lto = "thin"
 codegen-units = 8
-
-[profile.dev.package]
-insta.opt-level = 3
-similar.opt-level = 3

docs/04-usage.md

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ Details of `text analyzer` configuration can be found in the [text analyzer](05-
 ### Model
 
 `model` has 2 main types, `pre-trained`, `custom`.
-- `pre-trained` models have pre-trained vocab lists and some pre-defined tokenization rules. e.g. [`bert_base_uncased`](https://huggingface.co/google-bert/bert-base-uncased), `wiki_tocken`(https://huggingface.co/datasets/iohadrubin/wikitext-103-raw-v1).
+- `pre-trained` models have pre-trained vocab lists and some pre-defined tokenization rules. e.g. [`bert_base_uncased`](https://huggingface.co/google-bert/bert-base-uncased), [`wiki_tocken`](https://huggingface.co/datasets/iohadrubin/wikitext-103-raw-v1).
 - `custom` models will maintain their own vocab mapping. You can build a custom model based on your own corpus easily.
 
 > Note that some models may have similar processes as `text analyzer`, so you can skip the `text analyzer` configuration for these models.

src/model/custom.rs

Lines changed: 27 additions & 23 deletions
@@ -302,29 +302,33 @@ fn custom_model_tokenizer_set_target_column_trigger<'a>(
         new.set_by_index(idx, target)
             .expect("set target column failed");
     } else {
-        panic!(
-            "Unsupported target column type: {}",
-            lookup_type_name(attoid)
-        );
-        // TODO: cast it using spi, waiting for pgrx update
-        // let target_casted = pgrx::Spi::connect(|client| {
-        //     let tuptable = client
-        //         .select(
-        //             &format!("SELECT $1::{}", lookup_type_name(attoid)),
-        //             Some(1),
-        //             &[target.into()],
-        //         )
-        //         .unwrap_or_report();
-
-        //     tuptable
-        //         .first()
-        //         .get_datum_by_ordinal(1)
-        //         .unwrap_or_report()
-        //         .unwrap()
-        // });
-
-        //     new.set_by_index(idx, target_casted)
-        //         .expect("set target column failed");
+        let target_casted = pgrx::Spi::connect(|client| {
+            client
+                .select(
+                    &format!("SELECT $1::{}", lookup_type_name(attoid)),
+                    Some(1),
+                    &[target.into()],
+                )
+                .unwrap_or_report();
+
+            unsafe {
+                let table = pgrx::pg_sys::SPI_tuptable.as_mut().unwrap();
+                if table.numvals != 1 {
+                    panic!("unexpected number of tuples returned");
+                }
+                let heap_tuple = *(table.vals);
+                let heap_tuple = pgrx::pg_sys::SPI_copytuple(heap_tuple);
+
+                let mut is_null = false;
+                let datum = pgrx::pg_sys::SPI_getbinval(heap_tuple, table.tupdesc, 1, &mut is_null);
+
+                if is_null {
+                    panic!("unexpected null value");
+                }
+                datum
+            }
+        });
+        unsafe { new.set_by_index_unchecked(idx, Some(target_casted)) };
     }
 
     Ok(Some(new))
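Taken together, the change replaces the old panic on unsupported target column types with a round-trip through SPI: PostgreSQL itself casts the tokenized value to the column's declared type, which is what lets the trigger store into a `bm25vector` column. Here is the same new code path once more with explanatory comments (a reading aid, not part of the commit; `target`, `attoid`, `idx`, `new`, and `lookup_type_name` are as in the trigger function above):

```rust
// Ask PostgreSQL to perform the cast: `SELECT $1::<type>` goes through the
// server's normal cast rules, so any type with a registered cast from the
// source type works, including extension types such as bm25vector.
let target_casted = pgrx::Spi::connect(|client| {
    client
        .select(
            &format!("SELECT $1::{}", lookup_type_name(attoid)),
            Some(1),
            &[target.into()],
        )
        .unwrap_or_report();

    unsafe {
        // Read the raw SPI result instead of the safe wrapper, so the
        // Datum itself can be extracted.
        let table = pgrx::pg_sys::SPI_tuptable.as_mut().unwrap();
        if table.numvals != 1 {
            panic!("unexpected number of tuples returned");
        }
        // SPI frees its result memory when the connection closes;
        // SPI_copytuple copies the tuple into the caller's memory context
        // so a by-reference Datum stays valid afterwards.
        let heap_tuple = pgrx::pg_sys::SPI_copytuple(*table.vals);

        let mut is_null = false;
        let datum =
            pgrx::pg_sys::SPI_getbinval(heap_tuple, table.tupdesc, 1, &mut is_null);
        if is_null {
            panic!("unexpected null value");
        }
        datum
    }
});
// The Datum's Rust type is not statically known here, hence the
// unchecked setter.
unsafe { new.set_by_index_unchecked(idx, Some(target_casted)) };
```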
