Commit f532972

fix: avoid precision issues on vulkan backend (#980)
1 parent d5b05f7 commit f532972

2 files changed: 12 additions, 4 deletions

common.hpp

Lines changed: 6 additions & 2 deletions
@@ -242,14 +242,18 @@ class FeedForward : public GGMLBlock {
         }
 
         // net_1 is nn.Dropout(), skip for inference
-        float scale = 1.f;
+        bool force_prec_f32 = false;
+        float scale         = 1.f;
         if (precision_fix) {
             scale = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
         }
         // The purpose of the scale here is to prevent NaN issues in certain situations.
         // For example, when using Vulkan without enabling force_prec_f32,
         // or when using CUDA but the weights are k-quants.
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
     }
 
     struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {

qwen_image.hpp

Lines changed: 6 additions & 2 deletions
@@ -94,10 +94,14 @@ namespace Qwen {
             blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
             blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
 
-            float scale = 1.f / 32.f;
+            float scale         = 1.f / 32.f;
+            bool force_prec_f32 = false;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example when using CUDA but the weights are k-quants (not all prompts).
-            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, false, scale));
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
             // to_out.1 is nn.Dropout
 
             blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
