test(backup): Measure read/write latency during backup
* 50% reads and 50% writes.
* Compaction enabled.
* 75%-85% CPU utilization.
* Expect up to 6-7 ms P99 read latencies.
* Expect no more than 10 ms read latency during backup (see the sketch below).
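The latency expectations above are not asserted anywhere in the changed code; the measured numbers are only reported to Argus. A minimal sketch of how such a check could look, assuming a `stats` dict shaped like the one built by get_stress_averages() further down; the helper name, the constants, and the dict shape are illustrative assumptions, not part of this commit.

# Illustrative only: the 6-7 ms and 10 ms figures come from the commit message above;
# the helper name, the constants, and the shape of `stats` are assumptions.
P99_BASELINE_MS = 7.0          # expected P99 read latency without a concurrent backup
P99_DURING_BACKUP_MS = 10.0    # expected read latency ceiling while a backup is running

def check_latency_expectations(stats: dict, during_backup: bool) -> None:
    """Fail if the measured P99 read latency exceeds the stated expectation."""
    threshold = P99_DURING_BACKUP_MS if during_backup else P99_BASELINE_MS
    p99 = float(stats["latency 99th percentile"])   # milliseconds, as reported by cassandra-stress
    assert p99 <= threshold, (
        f"P99 read latency {p99:.2f} ms exceeds the {threshold:.1f} ms expectation "
        f"({'with' if during_backup else 'without'} concurrent backup)")

# Example: check_latency_expectations({"latency 99th percentile": 6.4}, during_backup=False)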
kreuzerkrieg committed Dec 29, 2024
1 parent 03cf4e5 commit 22c48dc
Showing 3 changed files with 49 additions and 23 deletions.
8 changes: 3 additions & 5 deletions configurations/manager/100GB_dataset.yaml
@@ -5,10 +5,8 @@ prepare_write_cmd: [ "cassandra-stress write cl=ALL n=26214400 -schema 'replicat
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]

-stress_read_cmd: [ "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=1..26214400",
-"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=26214401..52428800",
-"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
-"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]
+stress_cmd: [ "cassandra-stress read cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'",
+"cassandra-stress write cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'", ]

-instance_type_db: 'i3en.3xlarge'
+instance_type_db: 'i4i.4xlarge'
instance_type_loader: 'c6i.xlarge'
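As a quick sanity check on the sizing in this config (illustrative, not part of the commit): the four prepare commands write contiguous -pop ranges of 26,214,400 rows of 1 KiB each, which is exactly 100 GiB per replica before the RF=3 replication.

# Sanity check of the sizing above (illustrative, not part of the commit).
ROWS_PER_COMMAND = 26_214_400          # n=26214400 in each prepare_write_cmd entry
ROW_SIZE_BYTES = 1024                  # -col 'size=FIXED(1024) n=FIXED(1)'
RANGES = [(1, 26_214_400), (26_214_401, 52_428_800),
          (52_428_801, 78_643_200), (78_643_201, 104_857_600)]   # the four -pop seq ranges

total_rows = ROWS_PER_COMMAND * len(RANGES)
assert total_rows == RANGES[-1][1]     # the ranges tile 1..104857600 with no gaps or overlap
total_gib = total_rows * ROW_SIZE_BYTES / 2**30
print(f"{total_rows} rows x {ROW_SIZE_BYTES} B = {total_gib:.0f} GiB per replica (x3 with RF=3)")
# -> 104857600 rows x 1024 B = 100 GiB per replica (x3 with RF=3)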
60 changes: 42 additions & 18 deletions mgmt_cli_test.py
@@ -28,7 +28,6 @@

import boto3
import yaml
-from docker.errors import InvalidArgument

from invoke import exceptions

@@ -1656,7 +1655,7 @@ def report_to_argus(self, report_type: ManagerReportType, data: dict, label: str
            table = ManagerBackupBenchmarkResult(sut_timestamp=mgmt.get_scylla_manager_tool(
                manager_node=self.monitors.nodes[0]).sctool.client_version_timestamp)
        else:
-            raise InvalidArgument("Unknown report type")
+            raise ValueError("Unknown report type")

        for key, value in data.items():
            table.add_result(column=key, value=value, row=label, status=Status.UNSET)
@@ -1679,52 +1678,77 @@ def create_backup_and_report(self, mgr_cluster, label: str):
        return task

    def run_read_stress_and_report(self, label):
-        stress_queue = []
+        stress_queue = {"read": [], "write": []}

-        for command in self.params.get('stress_read_cmd'):
-            stress_queue.append(self.run_stress_thread(command, round_robin=True, stop_test_on_failure=False))
+        for command in self.params.get('stress_cmd'):
+            if " write " in command:
+                stress_queue["write"].append(self.run_stress_thread(command))
+            elif " read " in command:
+                stress_queue["read"].append(self.run_stress_thread(command))
+            else:
+                raise ValueError("Unknown stress command")

+        def get_stress_averages(queue):
+            averages = {'op rate': 0.0, 'partition rate': 0.0, 'row rate': 0.0, 'latency 99th percentile': 0.0}
+            num_results = 0
+            for stress in queue:
+                results = self.get_stress_results(queue=stress)
+                num_results += len(results)
+                for result in results:
+                    for key in averages:
+                        averages[key] += float(result[key])
+            stats = {key: averages[key] / num_results for key in averages}
+            return stats
+
        with ExecutionTimer() as stress_timer:
-            for stress in stress_queue:
-                assert self.verify_stress_thread(cs_thread_pool=stress), "Read stress command"
+            read_stats = get_stress_averages(stress_queue["read"])
+            write_stats = get_stress_averages(stress_queue["write"])

        InfoEvent(message=f'Read stress duration: {stress_timer.duration}s.').publish()

        read_stress_report = {
            "read time": int(stress_timer.duration.total_seconds()),
+            "op rate": read_stats['op rate'],
+            "partition rate": read_stats['partition rate'],
+            "row rate": read_stats['row rate'],
+            "latency 99th percentile": read_stats['latency 99th percentile'],
        }
+        self.report_to_argus(ManagerReportType.READ, read_stress_report, "Read stress: " + label)
+
+        write_stress_report = {
+            "read time": int(stress_timer.duration.total_seconds()),
+            "op rate": write_stats['op rate'],
+            "partition rate": write_stats['partition rate'],
+            "row rate": write_stats['row rate'],
+            "latency 99th percentile": write_stats['latency 99th percentile'],
+        }
-        self.report_to_argus(ManagerReportType.READ, read_stress_report, label)
+        self.report_to_argus(ManagerReportType.READ, write_stress_report, "Write stress: " + label)
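For readers unfamiliar with cassandra-stress result handling: get_stress_averages() above sums the per-loader totals returned by get_stress_results() and divides by the number of results (note it would raise ZeroDivisionError on an empty queue). A standalone sketch with made-up loader results, shown only to illustrate the averaging:

# Standalone illustration of the averaging in get_stress_averages() above.
# The dicts below are made-up per-loader totals; real ones come from self.get_stress_results().
sample_results = [
    {"op rate": "51000", "partition rate": "51000", "row rate": "51000", "latency 99th percentile": "6.0"},
    {"op rate": "49000", "partition rate": "49000", "row rate": "49000", "latency 99th percentile": "7.0"},
]

keys = ("op rate", "partition rate", "row rate", "latency 99th percentile")
averages = {key: sum(float(result[key]) for result in sample_results) / len(sample_results)
            for key in keys}
print(averages["op rate"], averages["latency 99th percentile"])   # 50000.0 6.5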

    def test_backup_benchmark(self):
        self.log.info("Executing test_backup_restore_benchmark...")

        self.log.info("Write data to table")
        self.run_prepare_write_cmd()

        self.log.info("Disable clusterwide compaction")
        compaction_ops = CompactionOps(cluster=self.db_cluster)
        # Disable keyspace autocompaction cluster-wide since we dont want it to interfere with our restore timing
        for node in self.db_cluster.nodes:
            compaction_ops.disable_autocompaction_on_ks_cf(node=node)

        manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
        mgr_cluster = self.ensure_and_get_cluster(manager_tool)

        self.log.info("Run read test")
        self.run_read_stress_and_report(" w/o concurrent backup")

        self.log.info("Create and report backup time")
        backup_task = self.create_backup_and_report(mgr_cluster, "Backup")

        self.log.info("Remove backup")
        backup_task.delete_backup_snapshot()

        self.log.info("Run read test")
        self.run_read_stress_and_report("Read stress")

        self.log.info("Create and report backup time during read stress")

        backup_thread = threading.Thread(target=self.create_backup_and_report,
                                         kwargs={"mgr_cluster": mgr_cluster, "label": "Backup during read stress"})

        read_stress_thread = threading.Thread(target=self.run_read_stress_and_report,
-                                              kwargs={"label": "Read stress during backup"})
+                                              kwargs={"label": " with concurrent backup"})
        backup_thread.start()
        read_stress_thread.start()

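The hunk above starts the backup and the mixed read/write stress on separate threads; the join/teardown is collapsed in this view. A rough sketch of the overall pattern, with the tail (the join calls) assumed rather than taken from the commit, and run_concurrently being a made-up wrapper name:

# General pattern sketched from the diff above; the join/cleanup at the end is an
# assumption, since the rest of the method is collapsed in this commit view.
import threading

def run_concurrently(create_backup_and_report, run_read_stress_and_report, mgr_cluster):
    # Kick off the backup and the mixed read/write stress at the same time.
    backup_thread = threading.Thread(target=create_backup_and_report,
                                     kwargs={"mgr_cluster": mgr_cluster,
                                             "label": "Backup during read stress"})
    stress_thread = threading.Thread(target=run_read_stress_and_report,
                                     kwargs={"label": " with concurrent backup"})
    backup_thread.start()
    stress_thread.start()
    # Wait for both so the latency report covers the whole backup window.
    backup_thread.join()
    stress_thread.join()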
4 changes: 4 additions & 0 deletions sdcm/argus_results.py
@@ -132,6 +132,10 @@ class Meta:
        description = "Read timing"
        Columns = [
            ColumnMetadata(name="read time", unit="s", type=ResultType.DURATION, higher_is_better=False),
+            ColumnMetadata(name="op rate", unit="op/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="partition rate", unit="pk/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="row rate", unit="row/s", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="latency 99th percentile", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
        ]


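The new columns are expected to line up with the keys of the read/write report dicts built in run_read_stress_and_report(); a tiny illustrative check with example values (not part of the commit):

# Illustrative only: the keys of the report dicts built in run_read_stress_and_report()
# are expected to match the column names declared above.
expected_columns = {"read time", "op rate", "partition rate", "row rate",
                    "latency 99th percentile"}
example_report = {"read time": 600, "op rate": 50000.0, "partition rate": 50000.0,
                  "row rate": 50000.0, "latency 99th percentile": 6.5}
assert set(example_report) == expected_columns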
