paritytech · athei · Feb 25, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/prdoc/pr_7721.prdoc b/prdoc/pr_7721.prdoc
@@ -0,0 +1,16 @@
+title: 'revive: Rework the instruction benchmark'
+doc:
+- audience: Runtime Dev
+  description: |-
+    Fixes https://github.com/paritytech/polkadot-sdk/issues/6157
+
+    This fixes the last remaining benchmark that was not correct since it was too low level to be written in Rust. Instead, we opted to write it in PolkaVM assembly.
+
+    This PR changes the benchmark that determines the scaling from `ref_time` to PolkaVM `Gas` by benchmarking the absolute worst case of an instruction: One that causes two cache misses by touching two cache lines.
+
+    The contract itself is designed to be as simple as possible. It does random unaligned reads in a loop until the `r` (repetition) number is reached. The randomness is fully generated by the host and written to the guest's memory before the benchmark is run. This allows the benchmark to determine the influence of one loop iteration via linear regression.
+crates:
+- name: pallet-revive
+  bump: patch
+- name: pallet-revive-fixtures
+  bump: major
diff --git a/substrate/frame/revive/Cargo.toml b/substrate/frame/revive/Cargo.toml
@@ -30,6 +30,8 @@ log = { workspace = true }
 paste = { workspace = true }
 polkavm = { version = "0.21.0", default-features = false }
 polkavm-common = { version = "0.21.0", default-features = false, optional = true }
+rand = { workspace = true, optional = true }
+rand_pcg = { workspace = true, optional = true }
 rlp = { workspace = true }
 scale-info = { features = ["derive"], workspace = true }
 serde = { features = [
@@ -96,6 +98,7 @@ std = [
 	"pallet-utility/std",
 	"polkavm-common?/std",
 	"polkavm/std",
+	"rand?/std",
 	"rlp/std",
 	"scale-info/std",
 	"secp256k1/std",
@@ -125,6 +128,8 @@ runtime-benchmarks = [
 	"pallet-transaction-payment/runtime-benchmarks",
 	"pallet-utility/runtime-benchmarks",
 	"polkavm-common/alloc",
+	"rand",
+	"rand_pcg",
 	"sp-consensus-aura",
 	"sp-consensus-babe",
 	"sp-consensus-slots",

diff --git a/substrate/frame/revive/fixtures/contracts/instr_benchmark.rs b/substrate/frame/revive/fixtures/contracts/instr_benchmark.rs
diff --git a/substrate/frame/revive/fixtures/src/lib.rs b/substrate/frame/revive/fixtures/src/lib.rs
@@ -40,7 +40,6 @@ pub mod bench {
 	use alloc::vec::Vec;
 	pub const DUMMY: &[u8] = fixture!("dummy");
 	pub const NOOP: &[u8] = fixture!("noop");
-	pub const INSTR: &[u8] = fixture!("instr_benchmark");
 
 	pub fn dummy_unique(replace_with: u32) -> Vec<u8> {
 		let mut dummy = DUMMY.to_vec();

diff --git a/substrate/frame/revive/src/benchmarking/call_builder.rs b/substrate/frame/revive/src/benchmarking/call_builder.rs
@@ -150,8 +150,11 @@ where
 		ext: &'a mut StackExt<'a, T>,
 		module: WasmBlob<T>,
 		input: Vec<u8>,
+		aux_data_size: u32,
 	) -> PreparedCall<'a, StackExt<'a, T>> {
-		module.prepare_call(Runtime::new(ext, input), ExportedFunction::Call).unwrap()
+		module
+			.prepare_call(Runtime::new(ext, input), ExportedFunction::Call, aux_data_size)
+			.unwrap()
 	}
 
 	/// Add transient_storage

diff --git a/substrate/frame/revive/src/benchmarking/code.rs b/substrate/frame/revive/src/benchmarking/code.rs
@@ -95,9 +95,28 @@ impl WasmModule {
 		Self::new(bench_fixtures::NOOP.to_vec())
 	}
 
-	/// A contract code that executes some ALU instructions in a loop.
-	pub fn instr() -> Self {
-		Self::new(bench_fixtures::INSTR.to_vec())
+	/// A contract that loops until `t0 == a1`; with `do_load` each iteration does the load `a0 = u64 [a0]`.
+	pub fn instr(do_load: bool) -> Self {
+		let load = match do_load {
+			false => "",
+			true => "a0 = u64 [a0]",
+		};
+		let text = alloc::format!(
+			"
+		pub @deploy:
+		ret
+		pub @call:
+			@loop:
+				jump @done if t0 == a1
+				{load}
+				t0 = t0 + 1
+				jump @loop
+			@done:
+		ret
+		"
+		);
+		let code = polkavm_common::assembler::assemble(&text).unwrap();
+		Self::new(code)
+	}
 
 	fn new(code: Vec<u8>) -> Self {

diff --git a/substrate/frame/revive/src/benchmarking/mod.rs b/substrate/frame/revive/src/benchmarking/mod.rs
@@ -59,12 +59,6 @@ use sp_runtime::{
 /// but might make the results less precise.
 const API_BENCHMARK_RUNS: u32 = 1600;
 
-/// How many runs we do per instruction benchmark.
-///
-/// Same rationale as for [`API_BENCHMARK_RUNS`]. The number is bigger because instruction
-/// benchmarks are faster.
-const INSTR_BENCHMARK_RUNS: u32 = 5000;
-
 /// Number of layers in a Radix16 unbalanced trie.
 const UNBALANCED_TRIE_LAYERS: u32 = 20;
 
@@ -549,7 +543,7 @@ mod benchmarks {
 	fn noop_host_fn(r: Linear<0, API_BENCHMARK_RUNS>) {
 		let mut setup = CallSetup::<T>::new(WasmModule::noop());
 		let (mut ext, module) = setup.ext();
-		let prepared = CallSetup::<T>::prepare_call(&mut ext, module, r.encode());
+		let prepared = CallSetup::<T>::prepare_call(&mut ext, module, r.encode(), 0);
 		#[block]
 		{
 			prepared.call().unwrap();
@@ -2004,11 +1998,82 @@ mod benchmarks {
 	}
 
 	// Benchmark the execution of instructions.
+	//
+	// It benchmarks the absolute worst case by allocating a lot of memory
+	// and then accessing it so that each instruction generates two cache misses.
+	#[benchmark(pov_mode = Ignored)]
+	fn instr(r: Linear<0, 10_000>) {
+		use rand::{seq::SliceRandom, SeedableRng};
+		use rand_pcg::Pcg64;
+
+		// Ideally, this needs to be bigger than the cache so that loads miss it.
+		const MEMORY_SIZE: u64 = sp_core::MAX_POSSIBLE_ALLOCATION as u64;
+
+		// This is benchmarked for x86-64.
+		const CACHE_LINE_SIZE: u64 = 64;
+
+		// An 8 byte load from this misalignment will reach into the subsequent line.
+		const MISALIGNMENT: u64 = 60;
+
+		// We only need one address per cache line.
+		// -1 because we skip the first address (it is the entry point).
+		const NUM_ADDRESSES: u64 = (MEMORY_SIZE - MISALIGNMENT) / CACHE_LINE_SIZE - 1;
+
+		assert!(
+			u64::from(r) <= NUM_ADDRESSES / 2,
+			"If we do too many iterations we run into the risk of loading from warm cache lines",
+		);
+
+		let mut setup = CallSetup::<T>::new(WasmModule::instr(true));
+		let (mut ext, module) = setup.ext();
+		let mut prepared =
+			CallSetup::<T>::prepare_call(&mut ext, module, Vec::new(), MEMORY_SIZE as u32);
+
+		assert!(
+			u64::from(prepared.aux_data_base()) & (CACHE_LINE_SIZE - 1) == 0,
+			"aux data base must be cache aligned"
+		);
+
+		// The address data will be located inside the aux data.
+		let misaligned_base = u64::from(prepared.aux_data_base()) + MISALIGNMENT;
+
+		// Create all possible addresses and shuffle them. This makes sure
+		// the accesses are random but no address is accessed more than once.
+		// We skip the first address since it is our entry point.
+		let mut addresses = Vec::with_capacity(NUM_ADDRESSES as usize);
+		for i in 1..NUM_ADDRESSES {
+			let addr = (misaligned_base + i * CACHE_LINE_SIZE).to_le_bytes();
+			addresses.push(addr);
+		}
+		let mut rng = Pcg64::seed_from_u64(1337);
+		addresses.shuffle(&mut rng);
+
+		// Each 8 byte address is zero padded to a full cache line so entries are one line apart.
+		let mut memory = Vec::with_capacity((NUM_ADDRESSES * CACHE_LINE_SIZE) as usize);
+		for address in addresses {
+			memory.extend_from_slice(&address);
+			memory.resize(memory.len() + CACHE_LINE_SIZE as usize - 8, 0);
+		}
+
+		// Copies `memory` to `aux_data_base + MISALIGNMENT`.
+		// Sets `a0 = aux_data_base + MISALIGNMENT` and `a1 = r`.
+		prepared
+			.setup_aux_data(memory.as_slice(), MISALIGNMENT as u32, r.into())
+			.unwrap();
+
+		#[block]
+		{
+			prepared.call().unwrap();
+		}
+	}
+
 	#[benchmark(pov_mode = Ignored)]
-	fn instr(r: Linear<0, INSTR_BENCHMARK_RUNS>) {
-		let mut setup = CallSetup::<T>::new(WasmModule::instr());
+	fn instr_empty_loop(r: Linear<0, 100_000>) {
+		let mut setup = CallSetup::<T>::new(WasmModule::instr(false));
 		let (mut ext, module) = setup.ext();
-		let prepared = CallSetup::<T>::prepare_call(&mut ext, module, r.encode());
+		let mut prepared = CallSetup::<T>::prepare_call(&mut ext, module, Vec::new(), 0);
+		prepared.setup_aux_data(&[], 0, r.into()).unwrap();
+
 		#[block]
 		{
 			prepared.call().unwrap();

diff --git a/substrate/frame/revive/src/gas.rs b/substrate/frame/revive/src/gas.rs
@@ -73,11 +73,12 @@ impl<T: Config> EngineMeter<T> {
 
 	/// How much ref time does each PolkaVM gas correspond to.
 	fn ref_time_per_fuel() -> u64 {
-		// We execute 6 different instructions therefore we have to divide the actual
-		// computed gas costs by 6 to have a rough estimate as to how expensive each
-		// single executed instruction is going to be.
-		let instr_cost = T::WeightInfo::instr(1).saturating_sub(T::WeightInfo::instr(0)).ref_time();
-		instr_cost / 6
+		let loop_iteration =
+			T::WeightInfo::instr(1).saturating_sub(T::WeightInfo::instr(0)).ref_time();
+		let empty_loop_iteration = T::WeightInfo::instr_empty_loop(1)
+			.saturating_sub(T::WeightInfo::instr_empty_loop(0))
+			.ref_time();
+		loop_iteration.saturating_sub(empty_loop_iteration)
 	}
 }
 

diff --git a/substrate/frame/revive/src/wasm/mod.rs b/substrate/frame/revive/src/wasm/mod.rs
@@ -307,13 +307,43 @@ where
 		let _ = self.runtime.ext().gas_meter_mut().sync_from_executor(self.instance.gas())?;
 		exec_result
 	}
+
+	/// Returns the guest memory address at which the aux data is located.
+	#[cfg(feature = "runtime-benchmarks")]
+	pub fn aux_data_base(&self) -> u32 {
+		self.instance.module().memory_map().aux_data_address()
+	}
+
+	/// Copies `data` into the aux data, `offset` bytes past the aux data base address.
+	///
+	/// It sets `a0` to the guest address the data was written to (`aux_data_base + offset`).
+	/// It sets `a1` to the value passed. Returns `CodeRejected` if the memory write fails.
+	///
+	/// Only used in benchmarking so far.
+	#[cfg(feature = "runtime-benchmarks")]
+	pub fn setup_aux_data(&mut self, data: &[u8], offset: u32, a1: u64) -> DispatchResult {
+		let a0 = self.aux_data_base().saturating_add(offset);
+		self.instance.write_memory(a0, data).map_err(|err| {
+			log::debug!(target: LOG_TARGET, "failed to write aux data: {err:?}");
+			Error::<E::T>::CodeRejected
+		})?;
+		self.instance.set_reg(polkavm::Reg::A0, a0.into());
+		self.instance.set_reg(polkavm::Reg::A1, a1);
+		Ok(())
+	}
 }
 
 impl<T: Config> WasmBlob<T> {
+	/// Compile and instantiate contract.
+	///
+	/// `aux_data_size` is only used for runtime benchmarks. Real contracts
+	/// don't make use of this buffer. Hence this should not be set to anything
+	/// other than `0` when not used for benchmarking.
 	pub fn prepare_call<E: Ext<T = T>>(
 		self,
 		mut runtime: Runtime<E, polkavm::RawInstance>,
 		entry_point: ExportedFunction,
+		aux_data_size: u32,
 	) -> Result<PreparedCall<E>, ExecError> {
 		let mut config = polkavm::Config::default();
 		config.set_backend(Some(polkavm::BackendKind::Interpreter));
@@ -332,6 +362,7 @@ impl<T: Config> WasmBlob<T> {
 		module_config.set_page_size(limits::PAGE_SIZE);
 		module_config.set_gas_metering(Some(polkavm::GasMeteringKind::Sync));
 		module_config.set_allow_sbrk(false);
+		module_config.set_aux_data_size(aux_data_size);
 		let module = polkavm::Module::new(&engine, &module_config, self.code.into_inner().into())
 			.map_err(|err| {
 			log::debug!(target: LOG_TARGET, "failed to create polkavm module: {err:?}");
@@ -375,7 +406,7 @@ where
 		function: ExportedFunction,
 		input_data: Vec<u8>,
 	) -> ExecResult {
-		let prepared_call = self.prepare_call(Runtime::new(ext, input_data), function)?;
+		let prepared_call = self.prepare_call(Runtime::new(ext, input_data), function, 0)?;
 		prepared_call.call()
 	}