5 changes: 3 additions & 2 deletions cactus/engine/engine.h
@@ -85,7 +85,7 @@ struct Config {
float max_pixels_tolerance = 2.0f;
bool do_image_splitting = true;

-enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7};
+enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, PHI3 = 8};
ModelType model_type = ModelType::QWEN;

enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -156,7 +156,7 @@ class Tokenizer {
void set_corpus_dir(const std::string& dir) { corpus_dir_ = dir; }

protected:
-enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT, WHISPER};
+enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT, WHISPER, PHI3};
ModelType model_type_ = ModelType::UNKNOWN;
enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -174,6 +174,7 @@ class Tokenizer {
std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
std::string format_smol_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+std::string format_phi3_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
};

class BPETokenizer : public Tokenizer {
3 changes: 3 additions & 0 deletions cactus/engine/engine_model.cpp
@@ -381,6 +381,7 @@ bool Config::from_json(const std::string& config_path) {
if (value == "gemma" || value == "GEMMA") model_type = ModelType::GEMMA;
else if (value == "lfm2" || value == "LFM2") model_type = ModelType::LFM2;
else if (value == "smol" || value == "SMOL" || value == "Smol") model_type = ModelType::SMOL;
else if (value == "phi3" || value == "PHI3" || value == "Phi3") model_type = ModelType::PHI3;
else if (value == "bert" || value == "BERT") model_type = ModelType::NOMIC;
else if (value == "whisper" || value == "WHISPER") model_type = ModelType::WHISPER;
else model_type = ModelType::QWEN;
@@ -477,6 +478,8 @@ std::unique_ptr<Model> create_model(const std::string& model_folder) {
return std::make_unique<LFM2Model>(config);
case Config::ModelType::SMOL:
return std::make_unique<SmolModel>(config);
+case Config::ModelType::PHI3:
+return std::make_unique<Phi3Model>(config);
case Config::ModelType::NOMIC:
return std::make_unique<NomicModel>(config);
case Config::ModelType::WHISPER:
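A minimal usage sketch of the new factory path above. The include path, the model folder name, and the failure behavior shown here are assumptions for illustration, not part of the PR:

```cpp
// Hypothetical driver: the folder's config file declares model_type "phi3",
// so Config::from_json maps it to ModelType::PHI3 and create_model returns
// a Phi3Model behind the common Model interface.
#include <iostream>
#include "cactus/engine/engine.h"  // include path assumed from the repo layout

int main() {
    auto model = cactus::engine::create_model("models/phi3-mini");  // folder name is illustrative
    std::cout << (model ? "created Phi3Model" : "creation failed") << "\n";
    return 0;
}
```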
11 changes: 10 additions & 1 deletion cactus/engine/engine_sp.cpp
@@ -215,7 +215,8 @@ std::string SPTokenizer::preprocess_text(const std::string& text) const {

for (size_t i = text.find_first_not_of(" "); i < text.length(); i++) {
char c = text[i];
-if (c == ' ') {
+// Phi-3 treats newlines like spaces in SentencePiece encoding
+if (c == ' ' || (c == '\n' && model_type_ == ModelType::PHI3)) {
processed += "▁";
} else {
processed += c;
@@ -383,6 +384,14 @@ std::vector<std::string> SPTokenizer::split_with_special_tokens(const std::string
}
result.push_back(best_special_token);
start = best_match_pos + best_match_len;
+// Phi-3 specific: rstrip whitespace after special tokens (except <|endoftext|>)
+// This matches `rstrip=True` behavior for Phi-3 chat tokens
+if (model_type_ == ModelType::PHI3 && best_special_token != "<|endoftext|>") {
+while (start < text.size() && (text[start] == ' ' || text[start] == '\n' ||
+text[start] == '\t' || text[start] == '\r')) {
+start++;
+}
+}
} else {
if (start < text.size()) {
result.push_back(text.substr(start));
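To make the new rstrip rule concrete, here is a standalone sketch of the same logic with a worked example; `rstrip_after_special` is a hypothetical helper written for this note, not a function in the PR:

```cpp
#include <cassert>
#include <string>

// Mirrors the loop above: after a Phi-3 special token other than
// <|endoftext|>, skip any run of spaces, newlines, tabs, and carriage returns.
size_t rstrip_after_special(const std::string& text, size_t start,
                            const std::string& token) {
    if (token == "<|endoftext|>") return start;
    while (start < text.size() && (text[start] == ' ' || text[start] == '\n' ||
                                   text[start] == '\t' || text[start] == '\r')) {
        start++;
    }
    return start;
}

int main() {
    std::string text = "<|user|>\n Hello";
    // "<|user|>" occupies indices 0-7, so the special-token match leaves
    // start at 8. The rule consumes '\n' (8) and ' ' (9), resuming at 'H' (10).
    assert(rstrip_after_special(text, 8, "<|user|>") == 10);
    return 0;
}
```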
31 changes: 31 additions & 0 deletions cactus/engine/engine_tokenizer.cpp
@@ -37,6 +37,9 @@ void Tokenizer::detect_model_type(const std::string& config_path) {
} else if (line.find("whisper") != std::string::npos) {
model_type_ = ModelType::WHISPER;
break;
+} else if (line.find("phi3") != std::string::npos || line.find("phi-3") != std::string::npos) {
+model_type_ = ModelType::PHI3;
+break;
} else {
model_type_ = ModelType::UNKNOWN;
}
@@ -93,6 +96,8 @@ std::string Tokenizer::format_chat_prompt(const std::vector<ChatMessage>& messages
return format_lfm2_style(messages, add_generation_prompt, tools_json);
case ModelType::SMOL:
return format_smol_style(messages, add_generation_prompt, tools_json);
+case ModelType::PHI3:
+return format_phi3_style(messages, add_generation_prompt, tools_json);
default:
return format_qwen_style(messages, add_generation_prompt, tools_json);
}
@@ -413,6 +418,32 @@ std::string Tokenizer::format_smol_style(const std::vector<ChatMessage>& messages
return result;
}

+std::string Tokenizer::format_phi3_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const {
+if (!tools_json.empty()) {
+return "ERROR: Tool calls are currently not supported for Phi-3 models";
+}
+
+std::string result;
+
+for (const auto& msg : messages) {
+if (msg.role == "system") {
+result += "<|system|>\n";
+} else if (msg.role == "user") {
+result += "<|user|>\n";
+} else if (msg.role == "assistant") {
+result += "<|assistant|>\n";
+}
+result += msg.content;
+result += "<|end|>\n";
+}
+
+if (add_generation_prompt) {
+result += "<|assistant|>\n";
+}
+
+return result;
+}


} // namespace engine
} // namespace cactus
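As a sanity check on format_phi3_style, this is the prompt it renders for a two-message chat with add_generation_prompt = true; the message contents are invented, but the role tags and <|end|> markers come straight from the function above:

```
<|system|>
You are a helpful assistant.<|end|>
<|user|>
What is 2 + 2?<|end|>
<|assistant|>
```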
41 changes: 41 additions & 0 deletions cactus/models/model.h
@@ -138,6 +138,47 @@ class SmolModel : public Model{
};


+class Phi3Model : public Model {
+public:
+Phi3Model();
+explicit Phi3Model(const Config& config);
+~Phi3Model() override = default;
+
+protected:
+size_t build_attention(CactusGraph* gb, size_t normalized_input, uint32_t layer_idx,
+ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) override;
+
+size_t build_mlp(CactusGraph* gb, size_t normalized_h, uint32_t layer_idx,
+ComputeBackend backend) const override;
+
+size_t build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
+ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) override;
+
+size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) override;
+void load_weights_to_graph(CactusGraph* gb) override;
+
+private:
+struct WeightNodeIDs {
+size_t output_weight;
+size_t output_norm_weight;
+
+struct LayerWeights {
+size_t attn_q_weight;
+size_t attn_k_weight;
+size_t attn_v_weight;
+size_t attn_output_weight;
+size_t input_layernorm_weight;
+size_t ffn_gate_weight;
+size_t ffn_up_weight;
+size_t ffn_down_weight;
+size_t post_attention_layernorm_weight;
+};
+
+std::vector<LayerWeights> layers;
+} weight_nodes_;
+};


class Siglip2VisionModel : public Model {
friend class Lfm2VlModel;

165 changes: 165 additions & 0 deletions cactus/models/model_phi.cpp
@@ -0,0 +1,165 @@
#include "model.h"
#include "../graph/graph.h"
#include <cmath>
#include <stdexcept>
#include <set>

namespace cactus {
namespace engine {

Phi3Model::Phi3Model() : Model() {}

Phi3Model::Phi3Model(const Config& config) : Model(config) {
weight_nodes_.layers.resize(config.num_layers);
}

void Phi3Model::load_weights_to_graph(CactusGraph* gb) {
embedding_node_id_ = gb->mmap_embeddings(embedding_file_path_);
weight_nodes_.output_norm_weight = gb->mmap_weights(model_folder_path_ + "/output_norm.weights");

if (config_.tie_word_embeddings) {
weight_nodes_.output_weight = embedding_node_id_;
output_weight_node_id_ = embedding_node_id_;
} else {
weight_nodes_.output_weight = gb->mmap_weights(model_folder_path_ + "/output_weight.weights");
output_weight_node_id_ = weight_nodes_.output_weight;
}

for (uint32_t i = 0; i < config_.num_layers; i++) {
auto& layer = weight_nodes_.layers[i];
std::string layer_prefix = model_folder_path_ + "/layer_" + std::to_string(i) + "_";
layer.attn_q_weight = gb->mmap_weights(layer_prefix + "attn_q.weights");
layer.attn_k_weight = gb->mmap_weights(layer_prefix + "attn_k.weights");
layer.attn_v_weight = gb->mmap_weights(layer_prefix + "attn_v.weights");
layer.attn_output_weight = gb->mmap_weights(layer_prefix + "attn_output.weights");
layer.input_layernorm_weight = gb->mmap_weights(layer_prefix + "input_norm.weights");
layer.ffn_gate_weight = gb->mmap_weights(layer_prefix + "ffn_gate.weights");
layer.ffn_up_weight = gb->mmap_weights(layer_prefix + "ffn_up.weights");
layer.ffn_down_weight = gb->mmap_weights(layer_prefix + "ffn_down.weights");
layer.post_attention_layernorm_weight = gb->mmap_weights(layer_prefix + "post_attn_norm.weights");
}
}

size_t Phi3Model::build_attention(CactusGraph* gb, size_t normalized_input, uint32_t layer_idx,
ComputeBackend backend, bool use_cache, size_t position_offset) {
const auto& layer = weight_nodes_.layers[layer_idx];

auto q_proj = gb->matmul(normalized_input, layer.attn_q_weight, true, backend);
auto k_proj = gb->matmul(normalized_input, layer.attn_k_weight, true, backend);
auto v_proj = gb->matmul(normalized_input, layer.attn_v_weight, true, backend);

const auto& q_shape = gb->get_output_buffer(q_proj).shape;
size_t seq_len = q_shape[0];

auto q_proj_4d = gb->reshape(q_proj, {1, seq_len, config_.attention_heads, config_.attention_head_dim});
auto k_proj_4d = gb->reshape(k_proj, {1, seq_len, config_.attention_kv_heads, config_.attention_head_dim});
auto v_proj_4d = gb->reshape(v_proj, {1, seq_len, config_.attention_kv_heads, config_.attention_head_dim});

if (config_.rope_theta > 0) {
q_proj_4d = gb->rope(q_proj_4d, config_.rope_theta, position_offset);
k_proj_4d = gb->rope(k_proj_4d, config_.rope_theta, position_offset);
}

size_t final_k = k_proj_4d;
size_t final_v = v_proj_4d;

if (use_cache && !kv_cache_.is_empty()) {
auto k_view = kv_cache_.get_key_view(layer_idx);
auto v_view = kv_cache_.get_value_view(layer_idx);

if (k_view.ptr2 == nullptr && v_view.ptr2 == nullptr) {
size_t cache_k_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);
size_t cache_v_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);

gb->set_input(cache_k_node, k_view.ptr1, kv_cache_.precision);
gb->set_input(cache_v_node, v_view.ptr1, kv_cache_.precision);

final_k = gb->concat(cache_k_node, k_proj_4d, 1);
final_v = gb->concat(cache_v_node, v_proj_4d, 1);
} else {
size_t cache_k_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);
size_t cache_v_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);

gb->set_input(cache_k_node, kv_cache_.get_key_ptr(layer_idx), kv_cache_.precision);
gb->set_input(cache_v_node, kv_cache_.get_value_ptr(layer_idx), kv_cache_.precision);

final_k = gb->concat(cache_k_node, k_proj_4d, 1);
final_v = gb->concat(cache_v_node, v_proj_4d, 1);
}
}

if (use_cache) {
cache_k_output_nodes_[layer_idx] = final_k;
cache_v_output_nodes_[layer_idx] = final_v;
}

auto attn_output_4d = gb->attention(q_proj_4d, final_k, final_v, attention_scale_, position_offset);
auto attn_output = gb->reshape(attn_output_4d, {seq_len, config_.attention_head_dim * config_.attention_heads});
return gb->matmul(attn_output, layer.attn_output_weight, true, backend);
}

size_t Phi3Model::build_mlp(CactusGraph* gb, size_t normalized_h, uint32_t layer_idx,
ComputeBackend backend) const {
const auto& layer = weight_nodes_.layers[layer_idx];
size_t gate_output = gb->matmul(normalized_h, layer.ffn_gate_weight, true, backend);
size_t up_output = gb->matmul(normalized_h, layer.ffn_up_weight, true, backend);
size_t gate_silu = gb->silu(gate_output);
size_t gated = gb->multiply(gate_silu, up_output);
return gb->matmul(gated, layer.ffn_down_weight, true, backend);
}

size_t Phi3Model::build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
ComputeBackend backend, bool use_cache, size_t position_offset) {
const auto& layer = weight_nodes_.layers[layer_idx];
auto normalized_input = gb->rms_norm(hidden, layer.input_layernorm_weight, config_.layer_norm_eps);
auto attn_output = build_attention(gb, normalized_input, layer_idx, backend, use_cache, position_offset);
auto after_attention = gb->add(hidden, attn_output);
auto normalized_after_attention = gb->rms_norm(after_attention, layer.post_attention_layernorm_weight, config_.layer_norm_eps);
auto mlp_output = build_mlp(gb, normalized_after_attention, layer_idx, backend);
return gb->add(after_attention, mlp_output);
}

size_t Phi3Model::forward(const std::vector<uint32_t>& tokens, bool use_cache) {
if (!initialized_ || !graph_handle_) {
throw std::runtime_error("Model not initialized - call init() first");
}

if (tokens.empty()) {
throw std::runtime_error("Token sequence cannot be empty");
}

auto* gb = static_cast<CactusGraph*>(graph_handle_);
gb->soft_reset();

auto seq_len = static_cast<size_t>(tokens.size());

size_t position_offset = use_cache ? kv_cache_.get_total_seq_len() : 0;

auto backend = config_.default_backend == Config::Backend::CPU
? ComputeBackend::CPU
: ComputeBackend::NPU;

auto input_node_id = gb->input({seq_len}, Precision::FP32);
auto hidden = gb->embedding(embedding_node_id_, input_node_id);

static std::set<uint32_t> skip_layers = {};
for (uint32_t layer_idx = 0; layer_idx < config_.num_layers; layer_idx++) {
if (skip_layers.count(layer_idx)) {
continue;
}
hidden = build_transformer_block(gb, hidden, layer_idx, backend, use_cache, position_offset);
}

auto final_hidden = gb->rms_norm(hidden, weight_nodes_.output_norm_weight, config_.layer_norm_eps);

std::vector<float> input_data(seq_len);
for (size_t i = 0; i < seq_len; i++) {
input_data[i] = static_cast<float>(tokens[i]);
}
gb->set_input(input_node_id, input_data.data(), Precision::FP32);

return final_hidden;
}

} // namespace engine
} // namespace cactus
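For reference, load_weights_to_graph above implies this on-disk layout per model folder (a one-layer listing for illustration; the embedding file's name comes from embedding_file_path_, which is set outside this diff, and output_weight.weights is only read when tie_word_embeddings is false):

```
<model_folder>/
    output_norm.weights
    output_weight.weights          (skipped when embeddings are tied)
    layer_0_attn_q.weights
    layer_0_attn_k.weights
    layer_0_attn_v.weights
    layer_0_attn_output.weights
    layer_0_input_norm.weights
    layer_0_ffn_gate.weights
    layer_0_ffn_up.weights
    layer_0_ffn_down.weights
    layer_0_post_attn_norm.weights
    layer_1_...                    (same nine files for each further layer)
```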