@@ -488,14 +488,14 @@ struct CLIPLayer : public GGMLBlock {
488
488
blocks[" mlp" ] = std::shared_ptr<GGMLBlock>(new CLIPMLP (d_model, intermediate_size));
489
489
}
490
490
// CLIPLayer::forward -- runs one layer over x in two steps:
//   1) x = x + self_attn(layer_norm1(x), mask)
//   2) x = x + mlp(layer_norm2(x))
// and returns the result of the second ggml_add.
//
// This hunk inserts a `ggml_backend_t backend` parameter right after `ctx`.
// Inside this method `backend` is only passed on to self_attn->forward; the
// layer norms, the MLP and ggml_add still receive `ctx` alone.
// NOTE(review): presumably the backend lets the attention block pick a
// backend-specific code path -- confirm against MultiheadAttention::forward.
//
// Parameters (new signature):
//   ctx     - ggml context handed to every sub-block call and to ggml_add.
//   backend - forwarded unchanged to the attention block.
//   x       - input activations (shape noted in the comment below).
//   mask    - forwarded unchanged to the attention block; defaults to true.
491
- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x, bool mask = true ) {
491
+ struct ggml_tensor * forward (struct ggml_context * ctx, ggml_backend_t backend, struct ggml_tensor * x, bool mask = true ) {
492
492
// x: [N, n_token, d_model]
// Look up the four sub-blocks by key in `blocks` and downcast each to its
// concrete type. std::dynamic_pointer_cast yields an empty pointer on a type
// mismatch, and none of the results is checked before being dereferenced below.
// NOTE(review): the leading space inside the key literals (and before `]`)
// looks like a rendering artifact of this diff view -- confirm against the
// actual header before relying on the exact key text.
493
493
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks[" self_attn" ]);
494
494
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm1" ]);
495
495
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm2" ]);
496
496
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks[" mlp" ]);
497
497
// Attention step: normalize x with layer_norm1, run attention on it, then add
// the result back onto the un-normalized x. The only difference between the
// removed and added line is the extra `backend` argument to self_attn->forward.
498
- x = ggml_add (ctx, x, self_attn->forward (ctx, layer_norm1->forward (ctx, x), mask));
498
+ x = ggml_add (ctx, x, self_attn->forward (ctx, backend, layer_norm1->forward (ctx, x), mask));
// MLP step: same pattern with layer_norm2 and mlp; this line is unchanged
// context, so the MLP path does not receive `backend`.
499
499
x = ggml_add (ctx, x, mlp->forward (ctx, layer_norm2->forward (ctx, x)));
500
500
return x;
501
501
}
@@ -517,7 +517,11 @@ struct CLIPEncoder : public GGMLBlock {
517
517
}
518
518
}
519
519
520
- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x, int clip_skip = -1 , bool mask = true ) {
520
+ struct ggml_tensor * forward (struct ggml_context * ctx,
521
+ ggml_backend_t backend,
522
+ struct ggml_tensor * x,
523
+ int clip_skip = -1 ,
524
+ bool mask = true ) {
521
525
// x: [N, n_token, d_model]
522
526
int layer_idx = n_layer - 1 ;
523
527
// LOG_DEBUG("clip_skip %d", clip_skip);
@@ -532,7 +536,7 @@ struct CLIPEncoder : public GGMLBlock {
532
536
}
533
537
std::string name = " layers." + std::to_string (i);
534
538
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
535
- x = layer->forward (ctx, x, mask); // [N, n_token, d_model]
539
+ x = layer->forward (ctx, backend, x, mask); // [N, n_token, d_model]
536
540
// LOG_DEBUG("layer %d", i);
537
541
}
538
542
return x;
@@ -712,6 +716,7 @@ class CLIPTextModel : public GGMLBlock {
712
716
}
713
717
714
718
struct ggml_tensor * forward (struct ggml_context * ctx,
719
+ ggml_backend_t backend,
715
720
struct ggml_tensor * input_ids,
716
721
struct ggml_tensor * tkn_embeddings,
717
722
size_t max_token_idx = 0 ,
@@ -722,7 +727,7 @@ class CLIPTextModel : public GGMLBlock {
722
727
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks[" final_layer_norm" ]);
723
728
724
729
auto x = embeddings->forward (ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
725
- x = encoder->forward (ctx, x, return_pooled ? -1 : clip_skip, true );
730
+ x = encoder->forward (ctx, backend, x, return_pooled ? -1 : clip_skip, true );
726
731
if (return_pooled || with_final_ln) {
727
732
x = final_layer_norm->forward (ctx, x);
728
733
}
@@ -775,6 +780,7 @@ class CLIPVisionModel : public GGMLBlock {
775
780
}
776
781
777
782
struct ggml_tensor * forward (struct ggml_context * ctx,
783
+ ggml_backend_t backend,
778
784
struct ggml_tensor * pixel_values,
779
785
bool return_pooled = true ,
780
786
int clip_skip = -1 ) {
@@ -786,7 +792,7 @@ class CLIPVisionModel : public GGMLBlock {
786
792
787
793
auto x = embeddings->forward (ctx, pixel_values); // [N, num_positions, embed_dim]
788
794
x = pre_layernorm->forward (ctx, x);
789
- x = encoder->forward (ctx, x, clip_skip, false );
795
+ x = encoder->forward (ctx, backend, x, clip_skip, false );
790
796
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
791
797
auto last_hidden_state = x;
792
798
x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
@@ -855,6 +861,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
855
861
}
856
862
857
863
struct ggml_tensor * forward (struct ggml_context * ctx,
864
+ ggml_backend_t backend,
858
865
struct ggml_tensor * pixel_values,
859
866
bool return_pooled = true ,
860
867
int clip_skip = -1 ) {
@@ -863,7 +870,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
863
870
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks[" vision_model" ]);
864
871
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks[" visual_projection" ]);
865
872
866
- auto x = vision_model->forward (ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
873
+ auto x = vision_model->forward (ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
867
874
868
875
if (return_pooled) {
869
876
x = visual_projection->forward (ctx, x); // [N, projection_dim]
@@ -900,6 +907,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
900
907
}
901
908
902
909
struct ggml_tensor * forward (struct ggml_context * ctx,
910
+ ggml_backend_t backend,
903
911
struct ggml_tensor * input_ids,
904
912
struct ggml_tensor * embeddings,
905
913
size_t max_token_idx = 0 ,
@@ -911,7 +919,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
911
919
input_ids = ggml_reshape_2d (ctx, input_ids, model.n_token , input_ids->ne [0 ] / model.n_token );
912
920
}
913
921
914
- return model.forward (ctx, input_ids, embeddings, max_token_idx, return_pooled);
922
+ return model.forward (ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
915
923
}
916
924
917
925
struct ggml_cgraph * build_graph (struct ggml_tensor * input_ids,
@@ -937,7 +945,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
937
945
embeddings = ggml_concat (compute_ctx, token_embed_weight, custom_embeddings, 1 );
938
946
}
939
947
940
- struct ggml_tensor * hidden_states = forward (compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
948
+ struct ggml_tensor * hidden_states = forward (compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
941
949
942
950
ggml_build_forward_expand (gf, hidden_states);
943
951
0 commit comments