From f003207316718536d62212e3dde40c4829de2550 Mon Sep 17 00:00:00 2001 From: Ernest Zaslavsky Date: Sun, 22 Dec 2024 11:20:34 +0200 Subject: [PATCH] test(backup): Measure read/write latency during back up * 50% reads and 50% writes. * Compaction enabled. * 75%-85% CPU utilization * Expect up to 6-7ms P99 read latencies. * Expect not more than 10ms read latency during backup --- configurations/manager/100GB_dataset.yaml | 8 ++-- mgmt_cli_test.py | 55 ++++++++++++++++------- sdcm/argus_results.py | 4 ++ 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/configurations/manager/100GB_dataset.yaml b/configurations/manager/100GB_dataset.yaml index c564d3a25e1..07590f7b162 100644 --- a/configurations/manager/100GB_dataset.yaml +++ b/configurations/manager/100GB_dataset.yaml @@ -5,10 +5,8 @@ prepare_write_cmd: [ "cassandra-stress write cl=ALL n=26214400 -schema 'replicat "cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200", "cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ] -stress_read_cmd: [ "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=1..26214400", - "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=26214401..52428800", - "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200", - "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ] +stress_read_cmd: [ "cassandra-stress read cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'", + "cassandra-stress write cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'", ] -instance_type_db: 'i3en.3xlarge' +instance_type_db: 'i4i.4xlarge' instance_type_loader: 'c6i.xlarge' diff --git a/mgmt_cli_test.py b/mgmt_cli_test.py index 726bddfad24..fbd59e41b67 100644 --- a/mgmt_cli_test.py +++ b/mgmt_cli_test.py @@ -1679,20 +1679,51 @@ def create_backup_and_report(self, mgr_cluster, label: str): return task def run_read_stress_and_report(self, label): - stress_queue = [] + stress_queue = {"read": [], "write": []} for command in self.params.get('stress_read_cmd'): - stress_queue.append(self.run_stress_thread(command, round_robin=True, stop_test_on_failure=False)) + if " write " in command: + stress_queue["write"].append(self.run_stress_thread(command)) + elif " read " in command: + stress_queue["read"].append(self.run_stress_thread(command)) + else: + raise InvalidArgument("Unknown stress command") + + def get_stress_averages(queue): + averages = {'op rate': 0.0, 'partition rate': 0.0, 'row rate': 0.0, 'latency 99th percentile': 0.0} + num_results = 0 + for stress in queue: + results = self.get_stress_results(queue=stress) + num_results += len(results) + for result in results: + for key in averages: + averages[key] += float(result[key]) + stats = {key: averages[key] / num_results for key in averages} + return stats with ExecutionTimer() as stress_timer: - for stress in stress_queue: - assert self.verify_stress_thread(cs_thread_pool=stress), "Read stress command" + read_stats = get_stress_averages(stress_queue["read"]) + write_stats = get_stress_averages(stress_queue["write"]) + InfoEvent(message=f'Read stress duration: {stress_timer.duration}s.').publish() read_stress_report = { "read time": int(stress_timer.duration.total_seconds()), + "op rate": read_stats['op rate'], + "partition rate": read_stats['partition rate'], + "row rate": read_stats['row rate'], + "latency 99th percentile": read_stats['latency 99th percentile'], + } + self.report_to_argus(ManagerReportType.READ, read_stress_report, "Read stress: " + label) + + write_stress_report = { + "read time": int(stress_timer.duration.total_seconds()), + "op rate": write_stats['op rate'], + "partition rate": write_stats['partition rate'], + "row rate": write_stats['row rate'], + "latency 99th percentile": write_stats['latency 99th percentile'], } - self.report_to_argus(ManagerReportType.READ, read_stress_report, label) + self.report_to_argus(ManagerReportType.READ, write_stress_report, "Write stress: " + label) def test_backup_benchmark(self): self.log.info("Executing test_backup_restore_benchmark...") @@ -1700,31 +1731,25 @@ def test_backup_benchmark(self): self.log.info("Write data to table") self.run_prepare_write_cmd() - self.log.info("Disable clusterwide compaction") - compaction_ops = CompactionOps(cluster=self.db_cluster) - # Disable keyspace autocompaction cluster-wide since we dont want it to interfere with our restore timing - for node in self.db_cluster.nodes: - compaction_ops.disable_autocompaction_on_ks_cf(node=node) - manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0]) mgr_cluster = self.ensure_and_get_cluster(manager_tool) + self.log.info("Run read test") + self.run_read_stress_and_report(" w/o concurrent backup") + self.log.info("Create and report backup time") backup_task = self.create_backup_and_report(mgr_cluster, "Backup") self.log.info("Remove backup") backup_task.delete_backup_snapshot() - self.log.info("Run read test") - self.run_read_stress_and_report("Read stress") - self.log.info("Create and report backup time during read stress") backup_thread = threading.Thread(target=self.create_backup_and_report, kwargs={"mgr_cluster": mgr_cluster, "label": "Backup during read stress"}) read_stress_thread = threading.Thread(target=self.run_read_stress_and_report, - kwargs={"label": "Read stress during backup"}) + kwargs={"label": " with concurrent backup"}) backup_thread.start() read_stress_thread.start() diff --git a/sdcm/argus_results.py b/sdcm/argus_results.py index b265c5b686b..624814b92ee 100644 --- a/sdcm/argus_results.py +++ b/sdcm/argus_results.py @@ -132,6 +132,10 @@ class Meta: description = "Read timing" Columns = [ ColumnMetadata(name="read time", unit="s", type=ResultType.DURATION, higher_is_better=False), + ColumnMetadata(name="op rate", unit="op/s", type=ResultType.FLOAT, higher_is_better=True), + ColumnMetadata(name="partition rate", unit="pk/s", type=ResultType.FLOAT, higher_is_better=True), + ColumnMetadata(name="row rate", unit="row/s", type=ResultType.FLOAT, higher_is_better=True), + ColumnMetadata(name="latency 99th percentile", unit="ms", type=ResultType.FLOAT, higher_is_better=False), ]