From cf4d1de2a11316299e6819235215fdb23c748f5a Mon Sep 17 00:00:00 2001
From: VolodymyrBg <aqdrgg19@gmail.com>
Date: Mon, 3 Mar 2025 17:40:44 +0200
Subject: [PATCH] perf(pcs): Optimize polynomial evaluation with cache-friendly
 algorithm

---
 pcs/src/poly.rs | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/pcs/src/poly.rs b/pcs/src/poly.rs
index 2f95b7ebc..c60b40b57 100644
--- a/pcs/src/poly.rs
+++ b/pcs/src/poly.rs
@@ -168,12 +168,41 @@ where
 
     // Horner's method for polynomial evaluation with cost O(n).
     fn horner_evaluate(&self, point: &F) -> T {
-        self.coeffs
-            .iter()
-            .rfold(T::zero(), move |mut result, coeff| {
-                result *= *point;
-                result + *coeff
-            })
+        // Below SMALL_POLY_LEN coefficients the plain Horner pass is used;
+        // longer inputs are processed in blocks of CHUNK_SIZE coefficients.
+        const SMALL_POLY_LEN: usize = 16;
+        const CHUNK_SIZE: usize = 8;
+
+        if self.coeffs.len() < SMALL_POLY_LEN {
+            return self
+                .coeffs
+                .iter()
+                .rfold(T::zero(), |mut acc, coeff| {
+                    acc *= *point;
+                    acc + *coeff
+                });
+        }
+
+        // Split p(x) = sum_j q_j(x) * x^(CHUNK_SIZE * j), where q_j is the
+        // polynomial formed by the j-th block of coefficients.
+        let stride = point.pow([CHUNK_SIZE as u64]);
+
+        // Lazily evaluate each q_j with Horner's rule, highest block first.
+        // Streaming the partial values avoids allocating a temporary Vec.
+        let mut partials = self.coeffs.chunks(CHUNK_SIZE).rev().map(|block| {
+            block.iter().rfold(T::zero(), |mut acc, coeff| {
+                acc *= *point;
+                acc + *coeff
+            })
+        });
+
+        // Outer Horner pass in x^CHUNK_SIZE, seeded with the highest block so
+        // that no multiplication is spent on a zero accumulator.
+        let highest = partials.next().unwrap_or_else(T::zero);
+        partials.fold(highest, |mut acc, partial| {
+            acc *= stride;
+            acc + partial
+        })
     }
 }