diff --git a/operatorspy/tests/random_sample.py b/operatorspy/tests/random_sample.py
index 85a3c681..7f477f99 100644
--- a/operatorspy/tests/random_sample.py
+++ b/operatorspy/tests/random_sample.py
@@ -199,7 +199,7 @@ def test_musa(lib, test_cases):
         (16384, 0.15, 0, 1, 2.0),
         (32000, 0.08, 0.8, 50, 1.0),
         (32000, 0.08, 1.0, 25, 1.0),
-        # (119696, 0.01, 1.0, 100, 1.0),
+        (119696, 0.01, 1.0, 100, 1.0),
     ]
     
     args = get_args()
diff --git a/src/ops/random_sample/ascend/random_sample_kernel.cpp b/src/ops/random_sample/ascend/random_sample_kernel.cpp
index 18b482bc..cc689ae5 100644
--- a/src/ops/random_sample/ascend/random_sample_kernel.cpp
+++ b/src/ops/random_sample/ascend/random_sample_kernel.cpp
@@ -3,6 +3,8 @@
 
 using namespace AscendC;
 
+const int32_t BLOCK_LEN = 256;
+
 template<typename T>
 class KernelRandomSample {
 public:
@@ -14,20 +16,22 @@ class KernelRandomSample {
         topk = topk_;
         voc = voc_;
         topp = topp_;
-        temperature = temper_;
+        invTemperature = 1.0f / temper_;
         random = random_;
-        blockSize = 256 * 2;
+        negMax = 0.f;
+        sum = 0.f;
 
         // CumSumInfo
-        if (sizeof(T) == sizeof(float)) {
-            topkAligned = (topk + 7) / 8 * 8;
-            vocAligned = (voc + 7) / 8 * 8;
-        } else {
-            topkAligned = (topk + 15) / 16 * 16;
-            vocAligned = (voc + 15) / 16 * 16;
-        }
+        topkAligned = topk * sizeof(T) % 32 == 0
+                          ? topk
+                          : (topk * sizeof(T) + 31) / 32 * 32 / sizeof(T);
+        vocAligned = voc * sizeof(T) % 32 == 0
+                         ? voc
+                         : (voc * sizeof(T) + 31) / 32 * 32 / sizeof(T);
         topkIdxAligned = (topk + 3) / 4 * 4;
 
+        bufferLen = topkAligned > BLOCK_LEN ? topkAligned : BLOCK_LEN;
+
         // Set Gm
         pGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(p), voc);
         topkGm.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(topkAddr), topk);
@@ -35,14 +39,13 @@ class KernelRandomSample {
         resGm.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t *>(res), 1);
 
         // Global input and output
-        pipe.InitBuffer(pQue, 1, vocAligned * sizeof(T));
         pipe.InitBuffer(topkQue, 1, topkAligned * sizeof(T));
         pipe.InitBuffer(topkIdxQue, 1, topkIdxAligned * sizeof(int64_t));
         pipe.InitBuffer(resQue, 1, 32);// 32 bytes for aligned
-
-        pipe.InitBuffer(softMaxBuf1, blockSize);
-        pipe.InitBuffer(softMaxBuf2, blockSize);
-        pipe.InitBuffer(softMaxBuf3, blockSize);
+        pipe.InitBuffer(inBuf, BLOCK_LEN * sizeof(T));
+        pipe.InitBuffer(tmpBuf1, bufferLen * sizeof(T));
+        pipe.InitBuffer(tmpBuf2, bufferLen * sizeof(T));
+        pipe.InitBuffer(tmpBuf3, bufferLen * sizeof(T));
         pipe.InitBuffer(softMaxOutBuf, topkAligned * sizeof(T));
 
         pipe.InitBuffer(inclusiveSumOutBuf, topkAligned * sizeof(T));
@@ -55,40 +58,12 @@ class KernelRandomSample {
 
 private:
     // Softmax
-    __aicore__ inline void SoftMax(LocalTensor<T> &valIn,
-                                   LocalTensor<T> &topkValIn,
+    __aicore__ inline void SoftMax(LocalTensor<T> &topkValIn,
                                    LocalTensor<T> &softMaxOut) {
-        int32_t repeatTimes = vocAligned * sizeof(T) / blockSize;
-        int32_t remainder = vocAligned * sizeof(T) % blockSize / sizeof(T);
-        int32_t tileLength = blockSize / sizeof(T);
-        float negMax = -static_cast<float>(topkValIn(0));
-        float invTemperature = 1.0f / temperature;
-        float sum = 0.f;
-        float sum_s = 0.f;
-        LocalTensor<T> tmpBuffer = softMaxBuf1.Get<T>();
-        LocalTensor<T> tmpBuffer2 = softMaxBuf2.Get<T>();
-        LocalTensor<T> tmpBuffer3 = softMaxBuf3.Get<T>();
-        for (int32_t i = 0; i < repeatTimes; i++) {
-            Adds(tmpBuffer, valIn[i * tileLength], static_cast<T>(negMax), tileLength);
-            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), tileLength);
-            Exp(tmpBuffer3, tmpBuffer2, tileLength);
-            sum_s = 0.f;
-            for (int j = 0; j < tileLength; ++j) {
-                sum_s += static_cast<float>(tmpBuffer3(j));
-            }
-            sum += sum_s;
-        }
-        if (remainder != 0) {
-            Adds(tmpBuffer, valIn[repeatTimes * tileLength], static_cast<T>(negMax), remainder);
-            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), remainder);
-            Exp(tmpBuffer3, tmpBuffer2, remainder);
-            sum_s = 0.f;
-            for (int i = 0; i < remainder; ++i) {
-                sum_s += static_cast<float>(tmpBuffer3(i));
-            }
-            sum += sum_s;
-        }
         float invSum = 1.0f / sum;
+        LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>();
+        LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>();
+        LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>();
         Adds(tmpBuffer, topkValIn, static_cast<T>(negMax), topk);
         Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), topk);
         Exp(tmpBuffer3, tmpBuffer2, topk);
@@ -131,27 +106,58 @@ class KernelRandomSample {
     }
 
     __aicore__ inline void CopyIn() {
-        LocalTensor<T> pLocal = pQue.AllocTensor<T>();
         LocalTensor<T> topkValLocal = topkQue.AllocTensor<T>();
         LocalTensor<int64_t> topkIdxLocal = topkIdxQue.AllocTensor<int64_t>();
-
-        DataCopy(pLocal, pGm, vocAligned);
         DataCopy(topkValLocal, topkGm, topkAligned);
         DataCopy(topkIdxLocal, topkIdxGm, topkIdxAligned);
+        // Get Max val of input
+        negMax = -static_cast<float>(topkValLocal(0));
+
+        // Copy in p and compute sum
+        int32_t repeatTimes = voc / BLOCK_LEN;
+        int32_t remainder = voc % BLOCK_LEN;
+        float sum_s = 0.f;
+        LocalTensor<T> inBuffer = inBuf.Get<T>();
+        LocalTensor<T> tmpBuffer = tmpBuf1.Get<T>();
+        LocalTensor<T> tmpBuffer2 = tmpBuf2.Get<T>();
+        LocalTensor<T> tmpBuffer3 = tmpBuf3.Get<T>();
+        for (int32_t i = 0; i < repeatTimes; i++) {
+            DataCopy(inBuffer, pGm[i * BLOCK_LEN], BLOCK_LEN);
+            Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), BLOCK_LEN);
+            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), BLOCK_LEN);
+            Exp(tmpBuffer3, tmpBuffer2, BLOCK_LEN);
+            sum_s = 0.f;
+            for (int j = 0; j < BLOCK_LEN; ++j) {
+                sum_s += static_cast<float>(tmpBuffer3(j));
+            }
+            sum += sum_s;
+        }
+        if (remainder != 0) {
+            int32_t remainderAligned = remainder * sizeof(T) % 32 == 0
+                                           ? remainder
+                                           : (remainder * sizeof(T) + 31) / 32 * 32 / sizeof(T);
+            DataCopy(inBuffer, pGm[repeatTimes * BLOCK_LEN], remainderAligned);
+            Adds(tmpBuffer, inBuffer, static_cast<T>(negMax), remainder);
+            Muls(tmpBuffer2, tmpBuffer, static_cast<T>(invTemperature), remainder);
+            Exp(tmpBuffer3, tmpBuffer2, remainder);
+            sum_s = 0.f;
+            for (int i = 0; i < remainder; ++i) {
+                sum_s += static_cast<float>(tmpBuffer3(i));
+            }
+            sum += sum_s;
+        }
 
-        pQue.EnQue(pLocal);
         topkQue.EnQue(topkValLocal);
         topkIdxQue.EnQue(topkIdxLocal);
     }
 
     __aicore__ inline void Compute() {
         // Get input data
-        LocalTensor<T> pLocal = pQue.DeQue<T>();
         LocalTensor<T> topkValLocal = topkQue.DeQue<T>();
 
         // SoftMax
         LocalTensor<T> softMaxOutLocal = softMaxOutBuf.Get<T>();
-        SoftMax(pLocal, topkValLocal, softMaxOutLocal);
+        SoftMax(topkValLocal, softMaxOutLocal);
 
         // InclusiveSum
         LocalTensor<T> inclusiveOutLocal = inclusiveSumOutBuf.Get<T>();
@@ -162,7 +168,6 @@ class KernelRandomSample {
         LocalTensor<int64_t> resultLocal = resQue.AllocTensor<int64_t>();
         RandomSample(inclusiveOutLocal, topkIdxLocal, resultLocal);
 
-        pQue.FreeTensor(pLocal);
         topkQue.FreeTensor(topkValLocal);
         topkIdxQue.FreeTensor(topkIdxLocal);
         resQue.EnQue(resultLocal);
@@ -181,14 +186,14 @@ class KernelRandomSample {
 
     TPipe pipe;
 
-    TQue<QuePosition::VECIN, 1> pQue;
     TQue<QuePosition::VECIN, 1> topkQue;
     TQue<QuePosition::VECIN, 1> topkIdxQue;
     TQue<QuePosition::VECOUT, 1> resQue;
 
-    TBuf<TPosition::VECCALC> softMaxBuf1;
-    TBuf<TPosition::VECCALC> softMaxBuf2;
-    TBuf<TPosition::VECCALC> softMaxBuf3;
+    TBuf<TPosition::VECCALC> inBuf;
+    TBuf<TPosition::VECCALC> tmpBuf1;
+    TBuf<TPosition::VECCALC> tmpBuf2;
+    TBuf<TPosition::VECCALC> tmpBuf3;
     TBuf<TPosition::VECCALC> softMaxOutBuf;
 
     TBuf<TPosition::VECCALC> inclusiveSumOutBuf;
@@ -197,13 +202,15 @@ class KernelRandomSample {
     int32_t topk;
     int32_t voc;
     float topp;
-    float temperature;
+    float invTemperature;
     float random;
+    float negMax;
+    float sum;
 
     int32_t topkAligned;
     int32_t topkIdxAligned;
     int32_t vocAligned;
-    int32_t blockSize;
+    int32_t bufferLen;
 };
 
 extern "C" __global__ __aicore__ void