Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set up calibration executables for load and provisioning data gathering #195

Merged
merged 11 commits into from
Jul 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Empty file.
74 changes: 74 additions & 0 deletions src/brad/calibration/load/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# CloudWatch (metric name, statistic) pairs gathered to characterize engine
# load. Every metric is collected with its minimum, maximum, and average
# statistic, in that order.
CLOUDWATCH_LOAD_METRICS = [
    (metric_name, statistic)
    for metric_name in (
        "CPUUtilization",
        "ReadIOPS",
        "WriteIOPS",
        "ReadThroughput",
        "WriteThroughput",
        "ReadLatency",
        "WriteLatency",
    )
    for statistic in ("Minimum", "Maximum", "Average")
]

# Performance Insights counters gathered to characterize engine load,
# grouped by subsystem. Concatenation order matches the original flat list.
PERF_INSIGHTS_LOAD_METRICS = (
    # OS-level load, CPU, disk, network, memory, and task counters.
    [
        "os.loadAverageMinute.one",
        "os.loadAverageMinute.five",
        "os.loadAverageMinute.fifteen",
        "os.cpuUtilization.system",
        "os.cpuUtilization.total",
        "os.cpuUtilization.user",
        "os.diskIO.avgQueueLen",
        "os.diskIO.tps",
        "os.diskIO.util",
        "os.diskIO.readIOsPS",
        "os.diskIO.readKbPS",
        "os.diskIO.writeIOsPS",
        "os.diskIO.writeKbPS",
        "os.network.rx",
        "os.network.tx",
        "os.memory.active",
        "os.memory.dirty",
        "os.memory.free",
        "os.memory.writeback",
        "os.memory.total",
        "os.tasks.blocked",
        "os.tasks.running",
        "os.tasks.sleeping",
        "os.tasks.stopped",
        "os.tasks.total",
    ]
    # Database-level SQL and transaction counters.
    + [
        "db.SQL.queries",
        "db.SQL.total_query_time",
        "db.SQL.tup_deleted",
        "db.SQL.tup_fetched",
        "db.SQL.tup_inserted",
        "db.SQL.tup_returned",
        "db.SQL.tup_updated",
        "db.Transactions.active_transactions",
        "db.Transactions.blocked_transactions",
        "db.Transactions.duration_commits",
        "db.Transactions.xact_commit",
        "db.Transactions.xact_rollback",
    ]
    # NOTE: Aurora has specific storage metrics (probably because they use a custom storage engine)
    # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/USER_PerfInsights_Counters.html#USER_PerfInsights_Counters.Aurora_PostgreSQL
    + [
        "os.diskIO.auroraStorage.auroraStorageBytesRx",
        "os.diskIO.auroraStorage.auroraStorageBytesTx",
        "os.diskIO.auroraStorage.diskQueueDepth",
        "os.diskIO.auroraStorage.readThroughput",
        "os.diskIO.auroraStorage.writeThroughput",
        "os.diskIO.auroraStorage.readLatency",
        "os.diskIO.auroraStorage.writeLatency",
        "os.diskIO.auroraStorage.readIOsPS",
        "os.diskIO.auroraStorage.writeIOsPS",
    ]
)
154 changes: 154 additions & 0 deletions src/brad/calibration/load/query_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import pathlib
import random
import sys
import time
import queue
import multiprocessing as mp
import io
import signal
from typing import Callable

from brad.config.engine import Engine
from brad.config.file import ConfigFile
from brad.connection.connection import Cursor
from brad.server.engine_connections import EngineConnections


class Options:
    """Configuration bundle for a single load-generating worker process."""

    def __init__(
        self,
        worker_idx: int,
        output_file: pathlib.Path,
        config: ConfigFile,
        engine: Engine,
        schema_name: str,
    ) -> None:
        # Worker identity and connection settings.
        self.worker_idx = worker_idx
        self.config = config
        self.engine = engine
        self.schema_name = schema_name
        self.output_file = output_file

        # Set to True if running on Redshift
        self.disable_redshift_cache = False

        # Gaussian "think time" distribution between successive queries.
        self.avg_gap_s = 1.0
        self.std_gap_s = 0.5

        # Base PRNG seed; each worker mixes in its own index.
        self.seed = 42


class Context:
    """Per-worker runtime state handed to each query-running callable."""

    def __init__(
        self,
        cursor: Cursor,
        output_file: io.TextIOBase,
        prng: random.Random,
        options: Options,
    ) -> None:
        self.options = options
        self.prng = prng
        self.output_file = output_file
        self.cursor = cursor


RunQueryCallable = Callable[[Context], None]


def run_specific_query_until_signalled(
    query_idx: int,
    query: str,
    options: Options,
    start_queue: mp.Queue,
    stop_queue: mp.Queue,
) -> None:
    """
    Subprocess entry point that repeatedly runs one specific query until the
    controller signals a stop through `stop_queue`.
    """
    run_until_signalled(
        get_run_specific_query(query_idx, query),
        options,
        start_queue,
        stop_queue,
    )


def run_until_signalled(
    run_query: RunQueryCallable,
    options: Options,
    start_queue: mp.Queue,
    stop_queue: mp.Queue,
) -> None:
    """
    Repeatedly invoke `run_query` until the controller signals a stop.

    Meant to be launched as a subprocess with multiprocessing. Handshake:
    after connecting, the worker puts a message on `start_queue` to signal
    readiness, blocks until a message arrives on `stop_queue` (the "go"
    signal), then loops running queries until a second message arrives on
    `stop_queue`.
    """

    def noop_handler(_signal, _frame):
        pass

    # Ignore Ctrl-C in the worker; shutdown is coordinated by the controller
    # through `stop_queue` instead.
    signal.signal(signal.SIGINT, noop_handler)

    ec = EngineConnections.connect_sync(
        options.config,
        options.schema_name,
        autocommit=True,
        specific_engines={options.engine},
    )

    try:
        conn = ec.get_connection(options.engine)
        cursor = conn.cursor_sync()

        # Hacky way to disable the query cache when applicable.
        if options.disable_redshift_cache:
            print(
                "Disabling Redshift result cache (client {})".format(options.worker_idx)
            )
            cursor.execute_sync("SET enable_result_cache_for_session = OFF;")

        # Per-worker PRNG: XOR the shared seed with the worker index so each
        # worker draws an independent but reproducible sequence.
        prng = random.Random(options.seed ^ options.worker_idx)

        with open(options.output_file, "w", encoding="UTF-8") as file:
            ctx = Context(cursor, file, prng, options)
            # CSV header for the per-query measurements written by `run_query`.
            print("query_idx,run_time_s", file=file, flush=True)

            # Signal that we're ready to start and wait for the controller.
            start_queue.put_nowait("")
            _ = stop_queue.get()

            while True:
                run_query(ctx)
                # Non-blocking poll for the stop signal between queries.
                try:
                    _ = stop_queue.get_nowait()
                    break
                except queue.Empty:
                    pass
    finally:
        ec.close_sync()


def get_run_specific_query(query_idx: int, query_str: str) -> RunQueryCallable:
    """
    Return a callable that runs `query_str` once per invocation, preceded by a
    randomized Gaussian "think time" delay drawn from the context's options.

    The callable appends a `query_idx,run_time_s` CSV row to the context's
    output file on success. On any error (e.g. a query timeout) it logs to
    stderr and skips the query instead of raising.
    """

    def run_specific_query(ctx: Context) -> None:
        # Gaussian samples can be negative; clamp the delay at zero.
        wait_for_s = max(
            0.0, ctx.prng.gauss(ctx.options.avg_gap_s, ctx.options.std_gap_s)
        )
        time.sleep(wait_for_s)

        try:
            # Use a monotonic clock for elapsed-time measurement: unlike
            # time.time(), it cannot jump backwards/forwards with system
            # clock adjustments (NTP, DST), which would corrupt run times.
            start = time.monotonic()
            ctx.cursor.execute_sync(query_str)
            ctx.cursor.fetchall_sync()
            end = time.monotonic()
            print(
                "{},{}".format(query_idx, end - start),
                file=ctx.output_file,
                flush=True,
            )

        except Exception as ex:  # Deliberate best-effort: skip failing queries.
            print(
                "Skipping query {} because of an error (potentially timeout)".format(
                    query_idx
                ),
                file=sys.stderr,
                flush=True,
            )
            print(ex, file=sys.stderr, flush=True)

    return run_specific_query
Loading