Skip to content

Commit a3d1d6d

Browse files
committed
test(backup): Measure read/write latency during backup
* 50% reads and 50% writes.
* Compaction enabled.
* 75%-85% CPU utilization.
* Expect up to 6-7 ms P99 read latencies.
* Expect no more than 10 ms read latency during backup.
1 parent 03cf4e5 commit a3d1d6d

File tree

3 files changed

+48
-20
lines changed

3 files changed

+48
-20
lines changed

configurations/manager/100GB_dataset.yaml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ prepare_write_cmd: [ "cassandra-stress write cl=ALL n=26214400 -schema 'replicat
55
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
66
"cassandra-stress write cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=500 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]
77

8-
stress_read_cmd: [ "cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=1..26214400",
9-
"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=26214401..52428800",
10-
"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=52428801..78643200",
11-
"cassandra-stress read cl=ALL n=26214400 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=50 -col 'size=FIXED(1024) n=FIXED(1)' -pop seq=78643201..104857600" ]
8+
stress_read_cmd: [ "cassandra-stress read cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'",
9+
"cassandra-stress write cl=ALL duration=10m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=100 -col 'size=FIXED(1024) n=FIXED(1)'", ]
1210

13-
instance_type_db: 'i3en.3xlarge'
11+
instance_type_db: 'i4i.4xlarge'
1412
instance_type_loader: 'c6i.xlarge'

mgmt_cli_test.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from textwrap import dedent
2626
from datetime import datetime, timedelta
2727
from dataclasses import dataclass
28+
from time import sleep
2829

2930
import boto3
3031
import yaml
@@ -1679,52 +1680,77 @@ def create_backup_and_report(self, mgr_cluster, label: str):
16791680
return task
16801681

16811682
def run_read_stress_and_report(self, label):
1682-
stress_queue = []
1683+
stress_queue = {"read": [], "write": []}
16831684

16841685
for command in self.params.get('stress_read_cmd'):
1685-
stress_queue.append(self.run_stress_thread(command, round_robin=True, stop_test_on_failure=False))
1686+
if " write " in command:
1687+
stress_queue["write"].append(self.run_stress_thread(command))
1688+
elif " read " in command:
1689+
stress_queue["read"].append(self.run_stress_thread(command))
1690+
else:
1691+
raise InvalidArgument("Unknown stress command")
1692+
1693+
def get_stress_averages(queue):
1694+
averages = {'op rate': 0.0, 'partition rate': 0.0, 'row rate': 0.0, 'latency 99th percentile': 0.0}
1695+
num_results = 0
1696+
for stress in queue:
1697+
results = self.get_stress_results(queue=stress)
1698+
num_results += len(results)
1699+
for result in results:
1700+
for key in averages:
1701+
averages[key] += float(result[key])
1702+
stats = {key: averages[key] / num_results for key in averages}
1703+
return stats
16861704

16871705
with ExecutionTimer() as stress_timer:
1688-
for stress in stress_queue:
1689-
assert self.verify_stress_thread(cs_thread_pool=stress), "Read stress command"
1706+
read_stats = get_stress_averages(stress_queue["read"])
1707+
write_stats = get_stress_averages(stress_queue["write"])
1708+
16901709
InfoEvent(message=f'Read stress duration: {stress_timer.duration}s.').publish()
16911710

16921711
read_stress_report = {
16931712
"read time": int(stress_timer.duration.total_seconds()),
1713+
"op rate": read_stats['op rate'],
1714+
"partition rate": read_stats['partition rate'],
1715+
"row rate": read_stats['row rate'],
1716+
"latency 99th percentile": read_stats['latency 99th percentile'],
1717+
}
1718+
self.report_to_argus(ManagerReportType.READ, read_stress_report, "Read stress: " + label)
1719+
1720+
write_stress_report = {
1721+
"read time": int(stress_timer.duration.total_seconds()),
1722+
"op rate": write_stats['op rate'],
1723+
"partition rate": write_stats['partition rate'],
1724+
"row rate": write_stats['row rate'],
1725+
"latency 99th percentile": write_stats['latency 99th percentile'],
16941726
}
1695-
self.report_to_argus(ManagerReportType.READ, read_stress_report, label)
1727+
self.report_to_argus(ManagerReportType.READ, write_stress_report, "Write stress: " + label)
16961728

16971729
def test_backup_benchmark(self):
16981730
self.log.info("Executing test_backup_restore_benchmark...")
16991731

17001732
self.log.info("Write data to table")
17011733
self.run_prepare_write_cmd()
17021734

1703-
self.log.info("Disable clusterwide compaction")
1704-
compaction_ops = CompactionOps(cluster=self.db_cluster)
1705-
# Disable keyspace autocompaction cluster-wide since we dont want it to interfere with our restore timing
1706-
for node in self.db_cluster.nodes:
1707-
compaction_ops.disable_autocompaction_on_ks_cf(node=node)
1708-
17091735
manager_tool = mgmt.get_scylla_manager_tool(manager_node=self.monitors.nodes[0])
17101736
mgr_cluster = self.ensure_and_get_cluster(manager_tool)
17111737

1738+
self.log.info("Run read test")
1739+
self.run_read_stress_and_report(" w/o concurrent backup")
1740+
17121741
self.log.info("Create and report backup time")
17131742
backup_task = self.create_backup_and_report(mgr_cluster, "Backup")
17141743

17151744
self.log.info("Remove backup")
17161745
backup_task.delete_backup_snapshot()
17171746

1718-
self.log.info("Run read test")
1719-
self.run_read_stress_and_report("Read stress")
1720-
17211747
self.log.info("Create and report backup time during read stress")
17221748

17231749
backup_thread = threading.Thread(target=self.create_backup_and_report,
17241750
kwargs={"mgr_cluster": mgr_cluster, "label": "Backup during read stress"})
17251751

17261752
read_stress_thread = threading.Thread(target=self.run_read_stress_and_report,
1727-
kwargs={"label": "Read stress during backup"})
1753+
kwargs={"label": " with concurrent backup"})
17281754
backup_thread.start()
17291755
read_stress_thread.start()
17301756

sdcm/argus_results.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ class Meta:
132132
description = "Read timing"
133133
Columns = [
134134
ColumnMetadata(name="read time", unit="s", type=ResultType.DURATION, higher_is_better=False),
135+
ColumnMetadata(name="op rate", unit="op/s", type=ResultType.FLOAT, higher_is_better=True),
136+
ColumnMetadata(name="partition rate", unit="pk/s", type=ResultType.FLOAT, higher_is_better=True),
137+
ColumnMetadata(name="row rate", unit="row/s", type=ResultType.FLOAT, higher_is_better=True),
138+
ColumnMetadata(name="latency 99th percentile", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
135139
]
136140

137141

0 commit comments

Comments
 (0)