Skip to content

Commit aa98ba0

Browse files
Introduce seq_len as inference param, and improve warnings (#15716)
Summary: Changes: 1. Add a `--seq_len` param to the llama script to distinguish it from `max_seq_len`, which is a compile-time param. 2. Add warnings in the runner when `seq_len` is clamped to `max_seq_len`, to avoid silently clamping it. 3. Add warnings in the token generator when EOS is not reached due to an insufficient `seq_len` or `max_seq_len`. Differential Revision: D86696759
1 parent 3e90b44 commit aa98ba0

File tree

3 files changed

+55
-3
lines changed

3 files changed

+55
-3
lines changed

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -941,7 +941,13 @@ def post_process():
941941
with open(f"{args.artifact}/outputs/outputs.txt", "r") as f:
942942
outputs.append(f.read())
943943

944-
seq_len = args.max_seq_len
944+
# Use --seq_len if provided (inference-only), otherwise fall back to --max_seq_len
945+
seq_len = args.seq_len
946+
if seq_len is None:
947+
logging.info(
948+
f"--seq_len not provided, using --max_seq_len ({args.max_seq_len}) as fallback"
949+
)
950+
seq_len = args.max_seq_len
945951
multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt])
946952
lookahead_args = " ".join(
947953
[
@@ -1170,11 +1176,18 @@ def _build_parser():
11701176

11711177
parser.add_argument(
11721178
"--max_seq_len",
1173-
help="This refers to maximum number of tokens that the model can process & consider at once to generate predictions/responses.",
1179+
help="[Compile-time] Maximum sequence length compiled into the model (sets buffer sizes and context_len). This is the hard limit for the model's context window.",
11741180
default=512,
11751181
type=int,
11761182
)
11771183

1184+
parser.add_argument(
1185+
"--seq_len",
1186+
help="[Runtime] Maximum number of tokens to generate (prompt + output). If not specified, uses --max_seq_len. Will be clamped to compiled max_seq_len if exceeded.",
1187+
default=None,
1188+
type=int,
1189+
)
1190+
11781191
parser.add_argument(
11791192
"--prefill_ar_len",
11801193
help="The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use this option to process the prompt and generate the key-value (kv) cache, which serves as a prompt processor for hybrid and lookahead mode.",

examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,22 @@ Error Runner<T>::generate_from_prompt_or_file(
376376
stats_.inference_start_ms = time_in_ms();
377377

378378
int32_t seq_len = config.seq_len;
379-
seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_;
379+
if (seq_len > context_len_) {
380+
ET_LOG(
381+
Info,
382+
"Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.",
383+
seq_len,
384+
context_len_,
385+
context_len_);
386+
seq_len = context_len_;
387+
} else if (seq_len <= 0) {
388+
ET_LOG(
389+
Info,
390+
"Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).",
391+
seq_len,
392+
context_len_);
393+
seq_len = context_len_;
394+
}
380395
int32_t n_bos = (cur_pos_ == 0) ? 1 : 0;
381396

382397
// encode the (string) prompt into tokens sequence

examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,30 @@ Result<int64_t> TokenGenerator<T>::generate(
323323
break;
324324
}
325325
}
326+
327+
// Check if generation was truncated due to seq_len limit (no EOS token)
328+
if (eos_ids_->count(cur_token) == 0 && pos >= seq_len - 1) {
329+
printf("\n");
330+
ET_LOG(
331+
Info,
332+
"Warning: Generation stopped at seq_len limit (%d) without reaching EOS token. Response may be incomplete.",
333+
seq_len);
334+
if (seq_len >= metadata_.context_len) {
335+
ET_LOG(
336+
Info,
337+
"- seq_len (%d) already equals compiled max_seq_len (%d). Consider recompiling with larger --max_seq_len.",
338+
seq_len,
339+
metadata_.context_len);
340+
} else {
341+
ET_LOG(
342+
Info,
343+
"- seq_len (%d) is less than compiled max_seq_len (%d). Consider increasing --seq_len (up to %d).",
344+
seq_len,
345+
metadata_.context_len,
346+
metadata_.context_len);
347+
}
348+
}
349+
326350
return pos - start_pos;
327351
}
328352
// Explicit instantiations

0 commit comments

Comments
 (0)