Skip to content

Commit e4b7346

Browse files
committed
Add README
1 parent d81ce04 commit e4b7346

File tree

7 files changed

+457
-548
lines changed

7 files changed

+457
-548
lines changed

LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
MIT License
1+
The MIT License (MIT)
22

3-
Copyright (c) 2023-2024 The ggml authors
3+
Copyright (c) 2018 Max Strübing
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 407 additions & 537 deletions
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3480,7 +3480,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34803480
[](common_params & params, const std::string & value) { params.diffusion_llada.cfg_scale = std::stof(value); }
34813481
).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA }));
34823482
add_opt(common_arg(
3483-
{ "--diffusion-remasking-alg" }, "N",
3483+
{ "--diffusion-alg" }, "N",
34843484
string_format("remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion_llada.remasking),
34853485
[](common_params & params, int value) { params.diffusion_llada.remasking = value; }
34863486
).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA }));

convert_hf_to_gguf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2946,15 +2946,15 @@ def set_gguf_parameters(self):
29462946
self.gguf_writer.add_rope_dimension_count(rope_dim)
29472947

29482948
# Set context length for LLaDA
2949-
context_length = self.hparams.get("max_sequence_length")
2949+
context_length = self.hparams.get("max_sequence_length", 4096)
29502950
self.gguf_writer.add_context_length(context_length)
29512951

29522952
# Set embedding length (dimension size)
2953-
embedding_length = self.hparams.get("d_model")
2953+
embedding_length = self.hparams.get("d_model", 4096)
29542954
self.gguf_writer.add_embedding_length(embedding_length)
29552955

29562956
# Set feed forward length (MLP hidden size)
2957-
feed_forward_length = self.hparams.get("mlp_hidden_size")
2957+
feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
29582958
self.gguf_writer.add_feed_forward_length(feed_forward_length)
29592959

29602960
# Set RoPE parameters

examples/diffusion/README.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Diffusion Text Generation Examples
2+
3+
This directory contains implementations for diffusion-based text generation using two different model architectures: **Dream** and **LLaDA-8B**. Both models use iterative denoising processes to generate text, but employ different sampling strategies and algorithms.
4+
5+
## Supported Models
6+
7+
### 1. Dream Model (`llama-diffusion-dream-cli`)
8+
9+
- https://huggingface.co/Dream-org/Dream-v0-Base-7B
10+
- Original PR - https://github.com/ggml-org/llama.cpp/pull/14644
11+
12+
The Dream model supports four different sampling algorithms controlled by the `--diffusion-alg` parameter:
13+
14+
1. **ORIGIN (0)** - Original diffusion algorithm
15+
- Uses probability transfer based on timestep ratios
16+
- Default algorithm with standard confidence-based token selection
17+
18+
2. **MASKGIT_PLUS (1)** - Enhanced MaskGIT sampling
19+
- Improved version of the MaskGIT algorithm
20+
21+
3. **TOPK_MARGIN (2)** - Top-K margin-based sampling
22+
- Confidence calculated as the margin between top-1 and top-2 probabilities
23+
24+
4. **ENTROPY (3)** - Entropy-based sampling (recommended)
25+
- Uses entropy calculation for confidence estimation
26+
27+
### 2. LLaDA-8B Model (`llama-diffusion-llada-cli`)
28+
29+
- https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct
30+
31+
#### LLaDA Model Remasking Strategies
32+
33+
The LLaDA model uses two remasking approaches controlled by the `--diffusion-alg` parameter:
34+
35+
1. **REMASKING_LOW_CONFIDENCE (0)** - Default strategy
36+
- Remasks tokens with lowest confidence scores
37+
- Uses softmax probabilities to determine confidence
38+
39+
2. **REMASKING_RANDOM (1)** - Random remasking

examples/diffusion/diffusion-llada-cli.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ int main(int argc, char ** argv) {
489489
//clear screen and move cursor to top-left
490490
LOG_INF("\033[2J\033[H");
491491
}
492-
492+
493493
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
494494
std::string output_data = common_detokenize(vocab, output_tokens, false);
495495
LOG_INF("\n%s\n", output_data.c_str());
@@ -502,4 +502,4 @@ int main(int argc, char ** argv) {
502502
llama_backend_free();
503503

504504
return 0;
505-
}
505+
}

src/llama-model.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8024,7 +8024,7 @@ struct llm_build_dream : public llm_graph_context {
80248024
};
80258025

80268026
struct llm_build_llada : public llm_graph_context {
8027-
llm_build_llada(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) :
8027+
llm_build_llada(const llama_model & model, const llm_graph_params & params) :
80288028
llm_graph_context(params) {
80298029
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
80308030
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -8077,7 +8077,7 @@ struct llm_build_llada : public llm_graph_context {
80778077
cb(Kcur, "Kcur", il);
80788078
cb(Vcur, "Vcur", il);
80798079

8080-
cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
8080+
cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
80818081
1.0f / sqrtf(float(n_embd_head)), il);
80828082
}
80838083

@@ -17337,7 +17337,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
1733717337
break;
1733817338
case LLM_ARCH_LLADA:
1733917339
{
17340-
llm = std::make_unique<llm_build_llada>(*this, params, gf);
17340+
llm = std::make_unique<llm_build_llada>(*this, params);
1734117341
}
1734217342
break;
1734317343
case LLM_ARCH_QWEN2VL:

0 commit comments

Comments
 (0)