Commit a5edfd7

feat: custom_model_tokenizer able to store bm25vector

Signed-off-by: Mingzhuo Yin <[email protected]>
1 parent: 5b7779d

File tree: 3 files changed, +29 -29 lines

Cargo.toml

Lines changed: 1 addition & 5 deletions
@@ -45,7 +45,7 @@ serde_json = "1.0.139"
 pgrx-tests = "=0.13.1"
 
 [patch.crates-io]
-pgrx = { git = "https://github.com/silver-ymz/pgrx", branch = "chore/add-ts_cache.h" }
+pgrx = { git = "https://github.com/silver-ymz/pgrx", branch = "patch-to-pg_tokenizer" }
 
 [profile.release]
 opt-level = 3
@@ -57,7 +57,3 @@ inherits = "dev"
 opt-level = 3
 lto = "thin"
 codegen-units = 8
-
-[profile.dev.package]
-insta.opt-level = 3
-similar.opt-level = 3

docs/04-usage.md

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ Details of `text analyzer` configuration can be found in the [text analyzer](05-
 ### Model
 
 `model` has 2 main types, `pre-trained`, `custom`.
-- `pre-trained` models have pre-trained vocab lists and some pre-defined tokenization rules. e.g. [`bert_base_uncased`](https://huggingface.co/google-bert/bert-base-uncased), `wiki_tocken`(https://huggingface.co/datasets/iohadrubin/wikitext-103-raw-v1).
+- `pre-trained` models have pre-trained vocab lists and some pre-defined tokenization rules. e.g. [`bert_base_uncased`](https://huggingface.co/google-bert/bert-base-uncased), [`wiki_tocken`](https://huggingface.co/datasets/iohadrubin/wikitext-103-raw-v1).
 - `custom` models will maintain their own vocab mapping. You can build a custom model based on your own corpus easily.
 
 > Note that some models may have similar processes as `text analyzer`, so you can skip the `text analyzer` configuration for these models.

src/model/custom.rs

Lines changed: 27 additions & 23 deletions
@@ -302,29 +302,33 @@ fn custom_model_tokenizer_set_target_column_trigger<'a>(
         new.set_by_index(idx, target)
             .expect("set target column failed");
     } else {
-        panic!(
-            "Unsupported target column type: {}",
-            lookup_type_name(attoid)
-        );
-        // TODO: cast it using spi, waiting for pgrx update
-        // let target_casted = pgrx::Spi::connect(|client| {
-        //     let tuptable = client
-        //         .select(
-        //             &format!("SELECT $1::{}", lookup_type_name(attoid)),
-        //             Some(1),
-        //             &[target.into()],
-        //         )
-        //         .unwrap_or_report();
-
-        //     tuptable
-        //         .first()
-        //         .get_datum_by_ordinal(1)
-        //         .unwrap_or_report()
-        //         .unwrap()
-        // });
-
-        //     new.set_by_index(idx, target_casted)
-        //         .expect("set target column failed");
+        let target_casted = pgrx::Spi::connect(|client| {
+            client
+                .select(
+                    &format!("SELECT $1::{}", lookup_type_name(attoid)),
+                    Some(1),
+                    &[target.into()],
+                )
+                .unwrap_or_report();
+
+            unsafe {
+                let table = pgrx::pg_sys::SPI_tuptable.as_mut().unwrap();
+                if table.numvals != 1 {
+                    panic!("unexpected number of tuples returned");
+                }
+                let heap_tuple = *(table.vals);
+                let heap_tuple = pgrx::pg_sys::SPI_copytuple(heap_tuple);
+
+                let mut is_null = false;
+                let datum = pgrx::pg_sys::SPI_getbinval(heap_tuple, table.tupdesc, 1, &mut is_null);
+
+                if is_null {
+                    panic!("unexpected null value");
+                }
+                datum
+            }
+        });
+        unsafe { new.set_by_index_unchecked(idx, Some(target_casted)) };
     }
 
     Ok(Some(new))
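Taken together, the change replaces the old panic on unsupported target column types with a round-trip through SPI: PostgreSQL itself casts the tokenized value to the column's declared type, which is what lets the trigger store into a `bm25vector` column. Here is the same new code path once more with explanatory comments (a reading aid, not part of the commit; `target`, `attoid`, `idx`, `new`, and `lookup_type_name` are as in the trigger function above):

```rust
// Ask PostgreSQL to perform the cast: `SELECT $1::<type>` goes through the
// server's normal cast rules, so any type with a registered cast from the
// source type works, including extension types such as bm25vector.
let target_casted = pgrx::Spi::connect(|client| {
    client
        .select(
            &format!("SELECT $1::{}", lookup_type_name(attoid)),
            Some(1),
            &[target.into()],
        )
        .unwrap_or_report();

    unsafe {
        // Read the raw SPI result instead of the safe wrapper, so the
        // Datum itself can be extracted.
        let table = pgrx::pg_sys::SPI_tuptable.as_mut().unwrap();
        if table.numvals != 1 {
            panic!("unexpected number of tuples returned");
        }
        // SPI frees its result memory when the connection closes;
        // SPI_copytuple copies the tuple into the caller's memory context
        // so a by-reference Datum stays valid afterwards.
        let heap_tuple = pgrx::pg_sys::SPI_copytuple(*table.vals);

        let mut is_null = false;
        let datum =
            pgrx::pg_sys::SPI_getbinval(heap_tuple, table.tupdesc, 1, &mut is_null);
        if is_null {
            panic!("unexpected null value");
        }
        datum
    }
});
// The Datum's Rust type is not statically known here, hence the
// unchecked setter.
unsafe { new.set_by_index_unchecked(idx, Some(target_casted)) };
```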
