test(backup): Measure read/write latency during back up
* 50% reads and 50% writes.
* Compaction enabled.
* 75%-85% CPU utilization.
* Expect up to 6-7ms P99 read latencies.
* Expect no more than 10ms read latency during backup (see the sketch below).
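
The latency figures above appear to be expectations for the run rather than hard assertions in the diff shown. (The 50/50 split comes from cassandra-stress's mixed workload, which defaults to a 1:1 read/write ratio; see the stress_cmd in the new test case below.) Purely as an illustration, with the function name and result key being assumptions rather than part of this commit, checking those expectations against HDR-derived stress results could look like this:

# Illustrative only: check the commit's stated P99 expectations against a
# results dict such as {"read p99": 6.3} (the key name is assumed).
P99_STEADY_MS = 7.0    # expected upper bound without a backup running
P99_BACKUP_MS = 10.0   # expected upper bound while a backup runs

def check_p99(results: dict, during_backup: bool) -> None:
    limit = P99_BACKUP_MS if during_backup else P99_STEADY_MS
    p99 = results["read p99"]
    assert p99 <= limit, f"P99 read latency {p99}ms exceeds the {limit}ms expectation"
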
kreuzerkrieg committed Dec 30, 2024
1 parent 03cf4e5 commit bb7483e
Showing 2 changed files with 38 additions and 29 deletions.
40 changes: 11 additions & 29 deletions mgmt_cli_test.py
@@ -28,7 +28,6 @@

import boto3
import yaml
from docker.errors import InvalidArgument

from invoke import exceptions

@@ -46,6 +45,7 @@
from sdcm.nemesis import MgmtRepair
from sdcm.utils.adaptive_timeouts import adaptive_timeout, Operations
from sdcm.utils.common import reach_enospc_on_node, clean_enospc_on_node
from sdcm.utils.decorators import latency_calculator_decorator
from sdcm.utils.issues import SkipPerIssues
from sdcm.utils.loader_utils import LoaderUtilsMixin
from sdcm.utils.time_utils import ExecutionTimer
@@ -1656,7 +1656,7 @@ def report_to_argus(self, report_type: ManagerReportType, data: dict, label: str
table = ManagerBackupBenchmarkResult(sut_timestamp=mgmt.get_scylla_manager_tool(
manager_node=self.monitors.nodes[0]).sctool.client_version_timestamp)
else:
raise InvalidArgument("Unknown report type")
raise ValueError("Unknown report type")

for key, value in data.items():
table.add_result(column=key, value=value, row=label, status=Status.UNSET)
@@ -1678,53 +1678,35 @@ def create_backup_and_report(self, mgr_cluster, label: str):
self.report_to_argus(ManagerReportType.BACKUP, backup_report, label)
return task

def run_read_stress_and_report(self, label):
stress_queue = []

for command in self.params.get('stress_read_cmd'):
stress_queue.append(self.run_stress_thread(command, round_robin=True, stop_test_on_failure=False))
@latency_calculator_decorator
def mixed_latency_load(self):
stress_load = self.run_stress_thread(self.params.get('stress_cmd'))
self.get_stress_results(queue=stress_load)

with ExecutionTimer() as stress_timer:
for stress in stress_queue:
assert self.verify_stress_thread(cs_thread_pool=stress), "Read stress command"
InfoEvent(message=f'Read stress duration: {stress_timer.duration}s.').publish()

read_stress_report = {
"read time": int(stress_timer.duration.total_seconds()),
}
self.report_to_argus(ManagerReportType.READ, read_stress_report, label)

def test_backup_benchmark(self):
def test_backup_benchmark_mixed(self):
self.log.info("Executing test_backup_restore_benchmark...")

self.log.info("Write data to table")
self.run_prepare_write_cmd()

self.log.info("Disable clusterwide compaction")
compaction_ops = CompactionOps(cluster=self.db_cluster)
# Disable keyspace autocompaction cluster-wide since we don't want it to interfere with our restore timing
for node in self.db_cluster.nodes:
compaction_ops.disable_autocompaction_on_ks_cf(node=node)

manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
mgr_cluster = self.ensure_and_get_cluster(manager_tool)

self.log.info("Run read test")
self.mixed_latency_load()

self.log.info("Create and report backup time")
backup_task = self.create_backup_and_report(mgr_cluster, "Backup")

self.log.info("Remove backup")
backup_task.delete_backup_snapshot()

self.log.info("Run read test")
self.run_read_stress_and_report("Read stress")

self.log.info("Create and report backup time during read stress")

backup_thread = threading.Thread(target=self.create_backup_and_report,
kwargs={"mgr_cluster": mgr_cluster, "label": "Backup during read stress"})

read_stress_thread = threading.Thread(target=self.run_read_stress_and_report,
kwargs={"label": "Read stress during backup"})
read_stress_thread = threading.Thread(target=self.mixed_latency_load)
backup_thread.start()
read_stress_thread.start()

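The new mixed_latency_load method relies on sdcm's latency_calculator_decorator, whose implementation lives in sdcm.utils.decorators and is not shown in this diff. As a rough mental model only (an assumption, not the sdcm code, which does more than time the call), the decorate-and-run-in-parallel pattern the test uses can be sketched as:

# Hypothetical sketch, NOT the sdcm implementation: a decorator that measures
# how long the wrapped load runs, plus the backup/load thread pairing used by
# the new test.
import functools
import threading
import time

def latency_calculator(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            print(f"{func.__name__} ran for {time.perf_counter() - start:.1f}s")
    return wrapper

@latency_calculator
def mixed_load():
    time.sleep(1)  # stand-in for the cassandra-stress mixed workload

backup_thread = threading.Thread(target=lambda: time.sleep(1))  # stand-in for the backup task
load_thread = threading.Thread(target=mixed_load)
backup_thread.start()
load_thread.start()
backup_thread.join()
load_thread.join()
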
27 changes: 27 additions & 0 deletions test-cases/manager/manager-backup-restore-baseline.yaml
@@ -0,0 +1,27 @@
test_duration: 120

round_robin: true

# 100GB dataset
prepare_write_cmd: [ "cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=1..26214400",
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=26214401..52428800",
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]

stress_cmd: "cassandra-stress mixed cl=QUORUM duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=100 fixed=50000/s' -col 'size=FIXED(1024) n=FIXED(1)'"


instance_type_db: 'i4i.4xlarge'
instance_type_loader: 'c6i.xlarge'

region_name: us-east-1
n_db_nodes: 3
n_loaders: 4
n_monitor_nodes: 1

post_behavior_db_nodes: "destroy"
post_behavior_loader_nodes: "destroy"
post_behavior_monitor_nodes: "destroy"

user_prefix: 'manager-backup-restore-baseline'
use_hdr_cs_histogram: true
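
The "# 100GB dataset" comment above can be verified from the prepare commands themselves: four commands, each writing 26,214,400 rows of 1 KiB. A quick check (plain arithmetic, not part of the commit):

# Sanity check of the "100GB dataset" comment in the test case above.
rows_per_cmd = 26_214_400      # n=26214400 in each prepare_write_cmd
commands = 4
row_size_bytes = 1024          # -col 'size=FIXED(1024) n=FIXED(1)'
total_bytes = rows_per_cmd * commands * row_size_bytes
print(total_bytes / 2**30)     # 100.0 GiB of raw data (three copies on disk with replication_factor=3)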
