test(backup): Measure read/write latency during back up

kreuzerkrieg · kreuzerkrieg · commit a3d1d6da8851 · 2024-12-23T13:30:58.000+02:00
* 50% reads and 50% writes.
* Compaction enabled.
* 75%-85% CPU utilization
* Expect up to 6-7ms P99 read latencies.
* Expect not more than 10ms read latency during backup
diff --git a/configurations/manager/100GB_dataset.yaml b/configurations/manager/100GB_dataset.yaml
@@ -5,10 +5,8 @@ prepare_write_cmd: [ "cassandra-stress write cl=ALL n=26214400 -schema 'replicat
                      "cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
                      "cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]
 
-stress_read_cmd: [ "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=1..26214400",
-                   "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=26214401..52428800",
-                   "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
-                   "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]
+stress_read_cmd: [ "cassandra-stress read  cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'",
+                   "cassandra-stress write cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native  -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'", ]
 
-instance_type_db: 'i3en.3xlarge'
+instance_type_db: 'i4i.4xlarge'
 instance_type_loader: 'c6i.xlarge'
diff --git a/mgmt_cli_test.py b/mgmt_cli_test.py
@@ -25,6 +25,7 @@
 from textwrap import dedent
 from datetime import datetime, timedelta
 from dataclasses import dataclass
+from time import sleep
 
 import boto3
 import yaml
@@ -1679,52 +1680,77 @@ def create_backup_and_report(self, mgr_cluster, label: str):
         return task
 
     def run_read_stress_and_report(self, label):
-        stress_queue = []
+        stress_queue = {"read": [], "write": []}
 
         for command in self.params.get('stress_read_cmd'):
-            stress_queue.append(self.run_stress_thread(command, round_robin=True, stop_test_on_failure=False))
+            if " write " in command:
+                stress_queue["write"].append(self.run_stress_thread(command))
+            elif " read " in command:
+                stress_queue["read"].append(self.run_stress_thread(command))
+            else:
+                raise InvalidArgument("Unknown stress command")
+
+        def get_stress_averages(queue):
+            averages = {'op rate': 0.0, 'partition rate': 0.0, 'row rate': 0.0, 'latency 99th percentile': 0.0}
+            num_results = 0
+            for stress in queue:
+                results = self.get_stress_results(queue=stress)
+                num_results += len(results)
+                for result in results:
+                    for key in averages:
+                        averages[key] += float(result[key])
+            stats = {key: averages[key] / num_results for key in averages}
+            return stats
 
         with ExecutionTimer() as stress_timer:
-            for stress in stress_queue:
-                assert self.verify_stress_thread(cs_thread_pool=stress), "Read stress command"
+            read_stats = get_stress_averages(stress_queue["read"])
+            write_stats = get_stress_averages(stress_queue["write"])
+
         InfoEvent(message=f'Read stress duration: {stress_timer.duration}s.').publish()
 
         read_stress_report = {
             "read time": int(stress_timer.duration.total_seconds()),
+            "op rate": read_stats['op rate'],
+            "partition rate": read_stats['partition rate'],
+            "row rate": read_stats['row rate'],
+            "latency 99th percentile": read_stats['latency 99th percentile'],
+        }
+        self.report_to_argus(ManagerReportType.READ, read_stress_report, "Read stress: " + label)
+
+        write_stress_report = {
+            "read time": int(stress_timer.duration.total_seconds()),
+            "op rate": write_stats['op rate'],
+            "partition rate": write_stats['partition rate'],
+            "row rate": write_stats['row rate'],
+            "latency 99th percentile": write_stats['latency 99th percentile'],
         }
-        self.report_to_argus(ManagerReportType.READ, read_stress_report, label)
+        self.report_to_argus(ManagerReportType.READ, write_stress_report, "Write stress: " + label)
 
     def test_backup_benchmark(self):
         self.log.info("Executing test_backup_restore_benchmark...")
 
         self.log.info("Write data to table")
         self.run_prepare_write_cmd()
 
-        self.log.info("Disable clusterwide compaction")
-        compaction_ops = CompactionOps(cluster=self.db_cluster)
-        #  Disable keyspace autocompaction cluster-wide since we dont want it to interfere with our restore timing
-        for node in self.db_cluster.nodes:
-            compaction_ops.disable_autocompaction_on_ks_cf(node=node)
-
         manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
         mgr_cluster = self.ensure_and_get_cluster(manager_tool)
 
+        self.log.info("Run read test")
+        self.run_read_stress_and_report(" w/o concurrent backup")
+
         self.log.info("Create and report backup time")
         backup_task = self.create_backup_and_report(mgr_cluster, "Backup")
 
         self.log.info("Remove backup")
         backup_task.delete_backup_snapshot()
 
-        self.log.info("Run read test")
-        self.run_read_stress_and_report("Read stress")
-
         self.log.info("Create and report backup time during read stress")
 
         backup_thread = threading.Thread(target=self.create_backup_and_report,
                                          kwargs={"mgr_cluster": mgr_cluster, "label": "Backup during read stress"})
 
         read_stress_thread = threading.Thread(target=self.run_read_stress_and_report,
-                                              kwargs={"label": "Read stress during backup"})
+                                              kwargs={"label": " with concurrent backup"})
         backup_thread.start()
         read_stress_thread.start()
 
diff --git a/sdcm/argus_results.py b/sdcm/argus_results.py
@@ -132,6 +132,10 @@ class Meta:
         description = "Read timing"
         Columns = [
             ColumnMetadata(name="read time", unit="s", type=ResultType.DURATION, higher_is_better=False),
+            ColumnMetadata(name="op rate", unit="op/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="partition rate", unit="pk/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="row rate", unit="row/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="latency 99th percentile", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
         ]
 
 

Original file line number	Diff line number	Diff line change
`@@ -132,6 +132,10 @@ class Meta:`
`132`	`132`	`description = "Read timing"`
`133`	`133`	`Columns = [`
`134`	`134`	`ColumnMetadata(name="read time", unit="s", type=ResultType.DURATION, higher_is_better=False),`
	`135`	`+ ColumnMetadata(name="op rate", unit="op/s", type=ResultType.FLOAT, higher_is_better=True),`
	`136`	`+ ColumnMetadata(name="partition rate", unit="pk/s", type=ResultType.FLOAT, higher_is_better=True),`
	`137`	`+ ColumnMetadata(name="row rate", unit="row/s", type=ResultType.FLOAT, higher_is_better=True),`
	`138`	`+ ColumnMetadata(name="latency 99th percentile", unit="ms", type=ResultType.FLOAT, higher_is_better=False),`
`135`	`139`	`]`
`136`	`140`
`137`	`141`