intel · guangyey · Oct 24, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp
@@ -18,16 +18,9 @@ TORCH_IMPL_FUNC(nll_loss_forward_out_xpu)
  int64_t ignore_index,
  const Tensor& output,
  const Tensor& total_weight) {
+  const Tensor& weight = weight_opt.getTensorRef();
   xpu::nll_loss_forward_kernel(
-      self,
-      target,
-      ((weight_opt.has_value() && (*weight_opt).defined())
-           ? at::OptionalTensorRef(*weight_opt)
-           : at::OptionalTensorRef()),
-      reduction,
-      ignore_index,
-      output,
-      total_weight);
+      output, total_weight, self, target, weight, reduction, ignore_index);
 }
 
 TORCH_IMPL_FUNC(nll_loss_backward_out_xpu)
@@ -39,19 +32,18 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_xpu)
  int64_t ignore_index,
  const Tensor& total_weight,
  const Tensor& grad_input) {
+  const Tensor& weight = weight_opt.getTensorRef();
   grad_input.zero_();
   xpu::nll_loss_backward_kernel(
+      grad_input,
       grad_output,
       self,
       target,
-      ((weight_opt.has_value() && (*weight_opt).defined())
-           ? at::OptionalTensorRef(*weight_opt)
-           : at::OptionalTensorRef()),
-      reduction,
-      ignore_index,
       total_weight,
-      grad_input);
+      weight,
+      reduction,
+      ignore_index);
 }
 
 } // namespace native
-} // namespace at
+} // namespace at
diff --git a/src/ATen/native/xpu/sycl/KernelUtils.h b/src/ATen/native/xpu/sycl/KernelUtils.h
@@ -11,3 +11,22 @@
                   i = _i_n_d_e_x)
 
 #define XPU_KERNEL_LOOP(item, i, n) XPU_KERNEL_LOOP_TYPE(item, i, n, int)
+
+// Default work-group size (work-items per group) used by GET_GROUPS below.
+constexpr int SYCL_NUM_THREADS = 1024;
+
+// Work-groups needed for N work-items; asserts inputs > 0 and result fits int.
+inline int GET_GROUPS(
+    const int64_t N,
+    const int64_t max_threads_per_group = SYCL_NUM_THREADS) {
+  TORCH_INTERNAL_ASSERT(
+      N > 0, "XPU kernel launch blocks must be positive, but got N=", N);
+  TORCH_INTERNAL_ASSERT(
+      max_threads_per_group > 0, "XPU work-group size must be positive");
+  constexpr int64_t max_int = std::numeric_limits<int>::max();
+  // Ceiling division as (N - 1) / size + 1: cannot overflow for positive N.
+  const int64_t group_num = (N - 1) / max_threads_per_group + 1;
+  TORCH_INTERNAL_ASSERT(
+      group_num <= max_int, "Can't schedule too many blocks on XPU device");
+  return static_cast<int>(group_num);
+}