5 changes: 3 additions & 2 deletions cactus/engine/engine.h
@@ -85,7 +85,7 @@ struct Config {
float max_pixels_tolerance = 2.0f;
bool do_image_splitting = true;

-enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7};
+enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, PHI3 = 8};
ModelType model_type = ModelType::QWEN;

enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -156,7 +156,7 @@ class Tokenizer {
void set_corpus_dir(const std::string& dir) { corpus_dir_ = dir; }

protected:
-enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT, WHISPER};
+enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT, WHISPER, PHI3};
ModelType model_type_ = ModelType::UNKNOWN;
enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -174,6 +174,7 @@ class Tokenizer {
std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
std::string format_smol_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
+std::string format_phi3_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
};

class BPETokenizer : public Tokenizer {
3 changes: 3 additions & 0 deletions cactus/engine/engine_model.cpp
@@ -381,6 +381,7 @@ bool Config::from_json(const std::string& config_path) {
if (value == "gemma" || value == "GEMMA") model_type = ModelType::GEMMA;
else if (value == "lfm2" || value == "LFM2") model_type = ModelType::LFM2;
else if (value == "smol" || value == "SMOL" || value == "Smol") model_type = ModelType::SMOL;
else if (value == "phi3" || value == "PHI3" || value == "Phi3") model_type = ModelType::PHI3;
else if (value == "bert" || value == "BERT") model_type = ModelType::NOMIC;
else if (value == "whisper" || value == "WHISPER") model_type = ModelType::WHISPER;
else model_type = ModelType::QWEN;
@@ -477,6 +478,8 @@ std::unique_ptr<Model> create_model(const std::string& model_folder) {
return std::make_unique<LFM2Model>(config);
case Config::ModelType::SMOL:
return std::make_unique<SmolModel>(config);
+case Config::ModelType::PHI3:
+return std::make_unique<Phi3Model>(config);
case Config::ModelType::NOMIC:
return std::make_unique<NomicModel>(config);
case Config::ModelType::WHISPER:
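A minimal usage sketch of the new factory path above. The include path, the model folder name, and the failure behavior shown here are assumptions for illustration, not part of the PR:

```cpp
// Hypothetical driver: the folder's config file declares model_type "phi3",
// so Config::from_json maps it to ModelType::PHI3 and create_model returns
// a Phi3Model behind the common Model interface.
#include <iostream>
#include "cactus/engine/engine.h"  // include path assumed from the repo layout

int main() {
    auto model = cactus::engine::create_model("models/phi3-mini");  // folder name is illustrative
    std::cout << (model ? "created Phi3Model" : "creation failed") << "\n";
    return 0;
}
```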
11 changes: 10 additions & 1 deletion cactus/engine/engine_sp.cpp
@@ -215,7 +215,8 @@ std::string SPTokenizer::preprocess_text(const std::string& text) const {

for (size_t i = text.find_first_not_of(" "); i < text.length(); i++) {
char c = text[i];
-if (c == ' ') {
+// Phi-3 treats newlines like spaces in SentencePiece encoding
+if (c == ' ' || (c == '\n' && model_type_ == ModelType::PHI3)) {
processed += "▁";
} else {
processed += c;
@@ -383,6 +384,14 @@ std::vector<std::string> SPTokenizer::split_with_special_tokens(const std::string
}
result.push_back(best_special_token);
start = best_match_pos + best_match_len;
+// Phi-3 specific: rstrip whitespace after special tokens (except <|endoftext|>)
+// This matches `rstrip=True` behavior for Phi-3 chat tokens
+if (model_type_ == ModelType::PHI3 && best_special_token != "<|endoftext|>") {
+while (start < text.size() && (text[start] == ' ' || text[start] == '\n' ||
+text[start] == '\t' || text[start] == '\r')) {
+start++;
+}
+}
} else {
if (start < text.size()) {
result.push_back(text.substr(start));
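To make the new rstrip rule concrete, here is a standalone sketch of the same logic with a worked example; `rstrip_after_special` is a hypothetical helper written for this note, not a function in the PR:

```cpp
#include <cassert>
#include <string>

// Mirrors the loop above: after a Phi-3 special token other than
// <|endoftext|>, skip any run of spaces, newlines, tabs, and carriage returns.
size_t rstrip_after_special(const std::string& text, size_t start,
                            const std::string& token) {
    if (token == "<|endoftext|>") return start;
    while (start < text.size() && (text[start] == ' ' || text[start] == '\n' ||
                                   text[start] == '\t' || text[start] == '\r')) {
        start++;
    }
    return start;
}

int main() {
    std::string text = "<|user|>\n Hello";
    // "<|user|>" occupies indices 0-7, so the special-token match leaves
    // start at 8. The rule consumes '\n' (8) and ' ' (9), resuming at 'H' (10).
    assert(rstrip_after_special(text, 8, "<|user|>") == 10);
    return 0;
}
```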
31 changes: 31 additions & 0 deletions cactus/engine/engine_tokenizer.cpp
@@ -37,6 +37,9 @@ void Tokenizer::detect_model_type(const std::string& config_path) {
} else if (line.find("whisper") != std::string::npos) {
model_type_ = ModelType::WHISPER;
break;
+} else if (line.find("phi3") != std::string::npos || line.find("phi-3") != std::string::npos) {
+model_type_ = ModelType::PHI3;
+break;
} else {
model_type_ = ModelType::UNKNOWN;
}
@@ -93,6 +96,8 @@ std::string Tokenizer::format_chat_prompt(const std::vector<ChatMessage>& messages
return format_lfm2_style(messages, add_generation_prompt, tools_json);
case ModelType::SMOL:
return format_smol_style(messages, add_generation_prompt, tools_json);
+case ModelType::PHI3:
+return format_phi3_style(messages, add_generation_prompt, tools_json);
default:
return format_qwen_style(messages, add_generation_prompt, tools_json);
}
@@ -413,6 +418,32 @@ std::string Tokenizer::format_smol_style(const std::vector<ChatMessage>& messages
return result;
}

+std::string Tokenizer::format_phi3_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const {
+if (!tools_json.empty()) {
+return "ERROR: Tool calls are currently not supported for Phi-3 models";
+}
+
+std::string result;
+
+for (const auto& msg : messages) {
+if (msg.role == "system") {
+result += "<|system|>\n";
+} else if (msg.role == "user") {
+result += "<|user|>\n";
+} else if (msg.role == "assistant") {
+result += "<|assistant|>\n";
+}
+result += msg.content;
+result += "<|end|>\n";
+}
+
+if (add_generation_prompt) {
+result += "<|assistant|>\n";
+}
+
+return result;
+}


} // namespace engine
} // namespace cactus
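As a sanity check on format_phi3_style, this is the prompt it renders for a two-message chat with add_generation_prompt = true; the message contents are invented, but the role tags and <|end|> markers come straight from the function above:

```
<|system|>
You are a helpful assistant.<|end|>
<|user|>
What is 2 + 2?<|end|>
<|assistant|>
```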
41 changes: 41 additions & 0 deletions cactus/models/model.h
@@ -138,6 +138,47 @@ class SmolModel : public Model{
};


+class Phi3Model : public Model {
+public:
+Phi3Model();
+explicit Phi3Model(const Config& config);
+~Phi3Model() override = default;
+
+protected:
+size_t build_attention(CactusGraph* gb, size_t normalized_input, uint32_t layer_idx,
+ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) override;
+
+size_t build_mlp(CactusGraph* gb, size_t normalized_h, uint32_t layer_idx,
+ComputeBackend backend) const override;
+
+size_t build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
+ComputeBackend backend, bool use_cache = false, size_t position_offset = 0) override;
+
+size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) override;
+void load_weights_to_graph(CactusGraph* gb) override;
+
+private:
+struct WeightNodeIDs {
+size_t output_weight;
+size_t output_norm_weight;
+
+struct LayerWeights {
+size_t attn_q_weight;
+size_t attn_k_weight;
+size_t attn_v_weight;
+size_t attn_output_weight;
+size_t input_layernorm_weight;
+size_t ffn_gate_weight;
+size_t ffn_up_weight;
+size_t ffn_down_weight;
+size_t post_attention_layernorm_weight;
+};
+
+std::vector<LayerWeights> layers;
+} weight_nodes_;
+};


class Siglip2VisionModel : public Model {
friend class Lfm2VlModel;

165 changes: 165 additions & 0 deletions cactus/models/model_phi.cpp
@@ -0,0 +1,165 @@
#include "model.h"
#include "../graph/graph.h"
#include <cmath>
#include <stdexcept>
#include <set>

namespace cactus {
namespace engine {

Phi3Model::Phi3Model() : Model() {}

Phi3Model::Phi3Model(const Config& config) : Model(config) {
weight_nodes_.layers.resize(config.num_layers);
}

void Phi3Model::load_weights_to_graph(CactusGraph* gb) {
embedding_node_id_ = gb->mmap_embeddings(embedding_file_path_);
weight_nodes_.output_norm_weight = gb->mmap_weights(model_folder_path_ + "/output_norm.weights");

if (config_.tie_word_embeddings) {
weight_nodes_.output_weight = embedding_node_id_;
output_weight_node_id_ = embedding_node_id_;
} else {
weight_nodes_.output_weight = gb->mmap_weights(model_folder_path_ + "/output_weight.weights");
output_weight_node_id_ = weight_nodes_.output_weight;
}

for (uint32_t i = 0; i < config_.num_layers; i++) {
auto& layer = weight_nodes_.layers[i];
std::string layer_prefix = model_folder_path_ + "/layer_" + std::to_string(i) + "_";
layer.attn_q_weight = gb->mmap_weights(layer_prefix + "attn_q.weights");
layer.attn_k_weight = gb->mmap_weights(layer_prefix + "attn_k.weights");
layer.attn_v_weight = gb->mmap_weights(layer_prefix + "attn_v.weights");
layer.attn_output_weight = gb->mmap_weights(layer_prefix + "attn_output.weights");
layer.input_layernorm_weight = gb->mmap_weights(layer_prefix + "input_norm.weights");
layer.ffn_gate_weight = gb->mmap_weights(layer_prefix + "ffn_gate.weights");
layer.ffn_up_weight = gb->mmap_weights(layer_prefix + "ffn_up.weights");
layer.ffn_down_weight = gb->mmap_weights(layer_prefix + "ffn_down.weights");
layer.post_attention_layernorm_weight = gb->mmap_weights(layer_prefix + "post_attn_norm.weights");
}
}

size_t Phi3Model::build_attention(CactusGraph* gb, size_t normalized_input, uint32_t layer_idx,
ComputeBackend backend, bool use_cache, size_t position_offset) {
const auto& layer = weight_nodes_.layers[layer_idx];

auto q_proj = gb->matmul(normalized_input, layer.attn_q_weight, true, backend);
auto k_proj = gb->matmul(normalized_input, layer.attn_k_weight, true, backend);
auto v_proj = gb->matmul(normalized_input, layer.attn_v_weight, true, backend);

const auto& q_shape = gb->get_output_buffer(q_proj).shape;
size_t seq_len = q_shape[0];

auto q_proj_4d = gb->reshape(q_proj, {1, seq_len, config_.attention_heads, config_.attention_head_dim});
auto k_proj_4d = gb->reshape(k_proj, {1, seq_len, config_.attention_kv_heads, config_.attention_head_dim});
auto v_proj_4d = gb->reshape(v_proj, {1, seq_len, config_.attention_kv_heads, config_.attention_head_dim});

if (config_.rope_theta > 0) {
q_proj_4d = gb->rope(q_proj_4d, config_.rope_theta, position_offset);
k_proj_4d = gb->rope(k_proj_4d, config_.rope_theta, position_offset);
}

size_t final_k = k_proj_4d;
size_t final_v = v_proj_4d;

if (use_cache && !kv_cache_.is_empty()) {
auto k_view = kv_cache_.get_key_view(layer_idx);
auto v_view = kv_cache_.get_value_view(layer_idx);

if (k_view.ptr2 == nullptr && v_view.ptr2 == nullptr) {
size_t cache_k_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);
size_t cache_v_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);

gb->set_input(cache_k_node, k_view.ptr1, kv_cache_.precision);
gb->set_input(cache_v_node, v_view.ptr1, kv_cache_.precision);

final_k = gb->concat(cache_k_node, k_proj_4d, 1);
final_v = gb->concat(cache_v_node, v_proj_4d, 1);
} else {
size_t cache_k_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);
size_t cache_v_node = gb->input({1, kv_cache_.current_seq_len, config_.attention_kv_heads, config_.attention_head_dim}, kv_cache_.precision);

gb->set_input(cache_k_node, kv_cache_.get_key_ptr(layer_idx), kv_cache_.precision);
gb->set_input(cache_v_node, kv_cache_.get_value_ptr(layer_idx), kv_cache_.precision);

final_k = gb->concat(cache_k_node, k_proj_4d, 1);
final_v = gb->concat(cache_v_node, v_proj_4d, 1);
}
}

if (use_cache) {
cache_k_output_nodes_[layer_idx] = final_k;
cache_v_output_nodes_[layer_idx] = final_v;
}

auto attn_output_4d = gb->attention(q_proj_4d, final_k, final_v, attention_scale_, position_offset);
auto attn_output = gb->reshape(attn_output_4d, {seq_len, config_.attention_head_dim * config_.attention_heads});
return gb->matmul(attn_output, layer.attn_output_weight, true, backend);
}

size_t Phi3Model::build_mlp(CactusGraph* gb, size_t normalized_h, uint32_t layer_idx,
ComputeBackend backend) const {
const auto& layer = weight_nodes_.layers[layer_idx];
size_t gate_output = gb->matmul(normalized_h, layer.ffn_gate_weight, true, backend);
size_t up_output = gb->matmul(normalized_h, layer.ffn_up_weight, true, backend);
size_t gate_silu = gb->silu(gate_output);
size_t gated = gb->multiply(gate_silu, up_output);
return gb->matmul(gated, layer.ffn_down_weight, true, backend);
}

size_t Phi3Model::build_transformer_block(CactusGraph* gb, size_t hidden, uint32_t layer_idx,
ComputeBackend backend, bool use_cache, size_t position_offset) {
const auto& layer = weight_nodes_.layers[layer_idx];
auto normalized_input = gb->rms_norm(hidden, layer.input_layernorm_weight, config_.layer_norm_eps);
auto attn_output = build_attention(gb, normalized_input, layer_idx, backend, use_cache, position_offset);
auto after_attention = gb->add(hidden, attn_output);
auto normalized_after_attention = gb->rms_norm(after_attention, layer.post_attention_layernorm_weight, config_.layer_norm_eps);
auto mlp_output = build_mlp(gb, normalized_after_attention, layer_idx, backend);
return gb->add(after_attention, mlp_output);
}

size_t Phi3Model::forward(const std::vector<uint32_t>& tokens, bool use_cache) {
if (!initialized_ || !graph_handle_) {
throw std::runtime_error("Model not initialized - call init() first");
}

if (tokens.empty()) {
throw std::runtime_error("Token sequence cannot be empty");
}

auto* gb = static_cast<CactusGraph*>(graph_handle_);
gb->soft_reset();

auto seq_len = static_cast<size_t>(tokens.size());

size_t position_offset = use_cache ? kv_cache_.get_total_seq_len() : 0;

auto backend = config_.default_backend == Config::Backend::CPU
? ComputeBackend::CPU
: ComputeBackend::NPU;

auto input_node_id = gb->input({seq_len}, Precision::FP32);
auto hidden = gb->embedding(embedding_node_id_, input_node_id);

static std::set<uint32_t> skip_layers = {};
for (uint32_t layer_idx = 0; layer_idx < config_.num_layers; layer_idx++) {
if (skip_layers.count(layer_idx)) {
continue;
}
hidden = build_transformer_block(gb, hidden, layer_idx, backend, use_cache, position_offset);
}

auto final_hidden = gb->rms_norm(hidden, weight_nodes_.output_norm_weight, config_.layer_norm_eps);

std::vector<float> input_data(seq_len);
for (size_t i = 0; i < seq_len; i++) {
input_data[i] = static_cast<float>(tokens[i]);
}
gb->set_input(input_node_id, input_data.data(), Precision::FP32);

return final_hidden;
}

} // namespace engine
} // namespace cactus
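For reference, load_weights_to_graph above implies this on-disk layout per model folder (a one-layer listing for illustration; the embedding file's name comes from embedding_file_path_, which is set outside this diff, and output_weight.weights is only read when tie_word_embeddings is false):

```
<model_folder>/
    output_norm.weights
    output_weight.weights          (skipped when embeddings are tied)
    layer_0_attn_q.weights
    layer_0_attn_k.weights
    layer_0_attn_v.weights
    layer_0_attn_output.weights
    layer_0_input_norm.weights
    layer_0_ffn_gate.weights
    layer_0_ffn_up.weights
    layer_0_ffn_down.weights
    layer_0_post_attn_norm.weights
    layer_1_...                    (same nine files for each further layer)
```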