From e77dcd04805408499078a656324a144ccd907e95 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 28 Aug 2024 16:57:34 +0300 Subject: [PATCH 1/7] core: Introduce binary tracer The goal of the tracer is to capture events at the frequence that exceeds the regular logging and metrics resolution. Basically, it's a buffer (well -- a bunch of buffers) into which pre-defined trace points but their binary data. Overhead: when OFF the it's is single if, when ON it's putting several bytes-long data, advancing a pointer and checking if the buffer is full. Current implementation pre-allocates 8 buffers 128k in size. Spoiler: io events take 25 bytes per request. At 10 kiops this size is able to store trace data for ~4 seconds. When a buffer is full it's flushed on disk in a single write, then re-cycled. If an event happens when there are no buffers available, it counts the number of skipped events and puts them into the next available buffer to at least let user know it happened. Tracing is started for a given duration (in seconds) and total size of flushed data (in bytes, but it's naturally rounded to 128k). Once either of the limits is hit, tracing is stopped. This is not to disturb system too much including both -- run-time overhead and consumed disk space. Message format is: event id (1 bytes) timestamp (2 bytes) incremental, duration since previous event in usecs, thus holding up to 64 msec length. then goes the payload of arbitrary length. The length is not saved, it's impled that event id defines the payload length. There are 4 service events - buffer-head: placed at the head of each buffer and includes the number of skipped events (4 bytes) and 4-bytes timestamp offset. The latter is 2x times larger than per-event timestamp, since it may take time to recycle a buffer and it can take more than 64 msec - opening: an even that's placed when tracing is enabled, includes shard number and whatever enabler code wants to put there (spoiler: IO tracer puts id:name mapping for registered sched classes) - tick: there's a timer running that puts this 3-bytes event every 32 msec. This is to prevent 2-bytes timestamps from overflowing in case no events happen for that long and as sanity milestones for decoder - terminator: when an event doesn't fit the buffer tail room, 1-bytes event (without timestamp) is placed there to tell decoder to stop decoding current buffer Signed-off-by: Pavel Emelyanov --- CMakeLists.txt | 2 + include/seastar/util/trace.hh | 138 ++++++++++++++++++++++++++++++ src/util/trace.cc | 154 ++++++++++++++++++++++++++++++++++ 3 files changed, 294 insertions(+) create mode 100644 include/seastar/util/trace.hh create mode 100644 src/util/trace.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2f7b6ce3..2804e8ccbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -688,6 +688,7 @@ add_library (seastar include/seastar/util/closeable.hh include/seastar/util/source_location-compat.hh include/seastar/util/short_streams.hh + include/seastar/util/trace.hh include/seastar/websocket/server.hh src/core/alien.cc src/core/file.cc @@ -777,6 +778,7 @@ add_library (seastar src/util/read_first_line.cc src/util/tmp_file.cc src/util/short_streams.cc + src/util/trace.cc src/websocket/server.cc ) diff --git a/include/seastar/util/trace.hh b/include/seastar/util/trace.hh new file mode 100644 index 0000000000..a895dd2a98 --- /dev/null +++ b/include/seastar/util/trace.hh @@ -0,0 +1,138 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2024 ScyllaDB. + */ + + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace seastar { +namespace internal { + +enum class trace_event { + OPENING, + BUFFER_HEAD, + T800, + TICK, +}; + +size_t tick_event_size(); + +template +struct event_tracer { + static int size() noexcept; + template + static void put(char* buf, Args&&... args) noexcept; +}; + +class tracer { +public: + static constexpr int buffer_size = 128*1024; + using opening_fn = noncopyable_function; + +private: + using clock_t = std::chrono::steady_clock; + + class impl { + static constexpr int max_pool_size = 8; + + tracer& _owner; + clock_t::time_point _prev_ts; + char* _head; + char* _tail; + unsigned int _skipped_events; + temporary_buffer _current; + circular_buffer> _pool; + + file _out; + future<> _flush_fiber = make_ready_future<>(); + circular_buffer> _flush_queue; + condition_variable _flush_signal; + size_t _flush_pos; + + timer _timeout; + size_t _total_size; + const size_t _max_size; + + timer _ticker; + + future<> run_flush_fiber() noexcept; + void flush() noexcept; + void set_current(temporary_buffer) noexcept; + + template + void put(char* buf) noexcept { + *(uint8_t*)buf = static_cast(Ev); + } + + std::chrono::microseconds fetch_add_ts() noexcept { + auto now = clock_t::now(); + auto delta = std::chrono::duration_cast(now - _prev_ts); + _prev_ts = now; + return delta; + } + + public: + impl(tracer&, std::chrono::seconds timeout, size_t max_size, file out, opening_fn); + + template + void trace(Args&&... args) noexcept { + if (_head == nullptr) [[unlikely]] { + _skipped_events++; + return; + } + + size_t hs = sizeof(uint8_t) + sizeof(uint16_t); + size_t s = event_tracer::size() + hs; + if (_head + s > _tail) [[unlikely]] { + flush(); + } + put(_head); + write_le(_head + sizeof(uint8_t), (uint16_t)fetch_add_ts().count()); + event_tracer::put(_head + hs, std::forward(args)...); + _head += s; + } + + future<> stop(); + }; + + std::unique_ptr _impl; + + void disable() noexcept; + +public: + template + void trace(Args&&... args) noexcept { + if (_impl) [[unlikely]] { + _impl->trace(std::forward(args)...); + } + } + + void enable(std::chrono::seconds timeout, size_t max_size, file out, opening_fn); +}; + +} // internal namespace +} // seastar namespace diff --git a/src/util/trace.cc b/src/util/trace.cc new file mode 100644 index 0000000000..8bbddeb670 --- /dev/null +++ b/src/util/trace.cc @@ -0,0 +1,154 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2024 ScyllaDB + */ + +#ifdef SEASTAR_MODULE +module; +#endif + +#include +#include +#include + +namespace seastar { +extern logger seastar_logger; + +namespace internal { + +template<> +struct event_tracer { + static int size() noexcept { return 0; } + static void put(char* buf) noexcept { } +}; + +size_t tick_event_size() { return event_tracer::size(); } + +void tracer::impl::set_current(temporary_buffer buf) noexcept { + _current = std::move(buf); + _head = _current.get_write(); + _tail = _head + _current.size(); + + put(_head); + write_le(_head + 1, (uint32_t)fetch_add_ts().count()); + write_le(_head + 5, (uint32_t)std::exchange(_skipped_events, 0)); + _head += sizeof(uint8_t) + 2 * sizeof(uint32_t); +} + +void tracer::impl::flush() noexcept { + if (_head < _tail) { + put(_head); + } + _flush_queue.push_back(std::move(_current)); + _flush_signal.signal(); + if (_pool.empty()) { + _head = nullptr; + } else { + set_current(std::move(_pool.front())); + _pool.pop_front(); + } + _total_size += buffer_size; + if (_total_size >= _max_size) { + _owner.disable(); + } +} + +future<> tracer::impl::run_flush_fiber() noexcept { + while (true) { + if (!_flush_queue.empty()) { + auto buf = std::move(_flush_queue.front()); + _flush_queue.pop_front(); + co_await _out.dma_write(_flush_pos, buf.get(), buf.size()); + _flush_pos += buf.size(); + if (_head == nullptr) { + set_current(std::move(buf)); + } else { + _pool.push_back(std::move(buf)); + } + continue; + } + if (!_timeout.armed()) { + break; + } + co_await _flush_signal.wait(); + } +} + +tracer::impl::impl(tracer& owner, std::chrono::seconds timeout, size_t max_size, file out, noncopyable_function opening) + : _owner(owner) + , _prev_ts(clock_t::now()) + , _skipped_events(0) + , _out(std::move(out)) + , _flush_pos(0) + , _timeout([this] { _owner.disable(); }) + , _total_size(0) + , _max_size(max_size) + , _ticker([this] mutable { trace(); }) +{ + _flush_queue.reserve(max_pool_size); + + _pool.reserve(max_pool_size); + for (int i = 0; i < max_pool_size; i++) { + auto buf = temporary_buffer::aligned(_out.memory_dma_alignment(), buffer_size); + _pool.emplace_back(std::move(buf)); + } + auto buf = temporary_buffer::aligned(_out.memory_dma_alignment(), buffer_size); + set_current(std::move(buf)); + + put(_head); + write_le(_head + 1, (uint8_t)this_shard_id()); + auto size = opening(_head + 4); + assert(size <= std::numeric_limits::max()); + write_le(_head + 2, (uint16_t)size); + _head += sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint16_t) + size; + + _timeout.arm(timeout); + _flush_fiber = run_flush_fiber(); + + _ticker.arm_periodic(std::chrono::milliseconds(32)); +} + +future<> tracer::impl::stop() { + _ticker.cancel(); + _timeout.cancel(); + if (_head != nullptr && _head != _current.get()) { + flush(); + } + co_await std::move(_flush_fiber); + assert(_flush_queue.empty()); + co_await _out.close(); + seastar_logger.info("Stopped tracing on shard"); +} + +void tracer::enable(std::chrono::seconds timeout, size_t max_size, file out, opening_fn opening) { + if (!_impl) { + _impl = std::make_unique(*this, timeout, max_size, std::move(out), std::move(opening)); + } +} + +void tracer::disable() noexcept { + auto ip = std::move(_impl); + if (ip) { + auto& i = *ip; + (void)i.stop().finally([i = std::move(ip)] {}); + } +} + +} // internal namespace +} // seastar namespace From 9d6ab80fa0abf2243720781d1bdf0373174a0aae Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 28 Aug 2024 20:16:00 +0300 Subject: [PATCH 2/7] app: Binary trace decoder As messages format is determined by the event id, decoder should know exactly the version from this a trace was generated, thus be tagged with the same "release ID" in the repository. Signed-off-by: Pavel Emelyanov --- apps/CMakeLists.txt | 1 + apps/trace/CMakeLists.txt | 24 ++++++ apps/trace/decode.cc | 165 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 apps/trace/CMakeLists.txt create mode 100644 apps/trace/decode.cc diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index a4f34812ce..e343967727 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -54,3 +54,4 @@ add_subdirectory (rpc_tester) add_subdirectory (iotune) add_subdirectory (memcached) add_subdirectory (seawreck) +add_subdirectory (trace) diff --git a/apps/trace/CMakeLists.txt b/apps/trace/CMakeLists.txt new file mode 100644 index 0000000000..deb87c4ba4 --- /dev/null +++ b/apps/trace/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# This file is open source software, licensed to you under the terms +# of the Apache License, Version 2.0 (the "License"). See the NOTICE file +# distributed with this work for additional information regarding copyright +# ownership. You may not use this file except in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# +# Copyright (C) 2024 Scylladb, Ltd. +# + +seastar_add_app (trace_decode + SOURCES decode.cc) diff --git a/apps/trace/decode.cc b/apps/trace/decode.cc new file mode 100644 index 0000000000..565335127c --- /dev/null +++ b/apps/trace/decode.cc @@ -0,0 +1,165 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2024 ScyllaDB + */ +#include +#include +#include +#include +#include +#include +#include + +using namespace seastar; + +size_t event_body_size(internal::trace_event ev) { + switch (ev) { + case internal::trace_event::TICK: + return 0; + case internal::trace_event::BUFFER_HEAD: + case internal::trace_event::OPENING: + case internal::trace_event::T800: + break; + } + fmt::print("Invalid event met ({})\n", static_cast(ev)); + throw std::runtime_error("invalid file content"); + return 0; +} + +template +std::string parse(temporary_buffer body); + +template<> +std::string parse(temporary_buffer body) { + return "TICK"; +} + +std::string parse_event(internal::trace_event ev, temporary_buffer body) { + switch (ev) { + case internal::trace_event::TICK: + return parse(std::move(body)); + case internal::trace_event::BUFFER_HEAD: + case internal::trace_event::OPENING: + case internal::trace_event::T800: + break; + } + return ""; +} + +std::string format_ts(uint64_t timestamp) { + // timestamps are in usec + return format("{:03d}.{:06d}", timestamp/1000000, timestamp%1000000); +} + +int main(int ac, char** av) { + namespace bpo = boost::program_options; + + app_template app; + auto opt_add = app.add_options(); + opt_add + ("file", bpo::value()->default_value("trace.0.bin"), "file to decode") + ; + + return app.run(ac, av, [&] { + return seastar::async([&] { + auto& opts = app.configuration(); + auto& fname = opts["file"].as(); + auto f = open_file_dma(fname, open_flags::ro).get(); + auto in = make_file_input_stream(std::move(f)); + size_t consumed = 0; + uint64_t timestamp = 0; + + while (true) { + auto tb = in.read_exactly(sizeof(uint8_t)).get(); + if (!tb) { + break; + } + const char* buf = tb.get(); + auto ev = static_cast(*(uint8_t*)buf); + + if (ev == internal::trace_event::OPENING) { + tb = in.read_exactly(sizeof(uint8_t) + sizeof(uint16_t)).get(); + if (!tb) { + fmt::print("corrupted buffer -- no opening body\n"); + break; + } + buf = tb.get(); + auto shard = read_le(buf); + auto size = read_le(buf + 1); + + tb = in.read_exactly(size).get(); + if (!tb) { + fmt::print("corrupted buffer -- no opening payload (size={})\n", size); + break; + } + buf = tb.get(); + fmt::print("{:08x}:- OPENING: shard={} text={}\n", consumed, shard, std::string(buf, size)); + consumed += sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint16_t) + size; + continue; + } + + if (ev == internal::trace_event::BUFFER_HEAD) { + tb = in.read_exactly(2 * sizeof(uint32_t)).get(); + if (!tb) { + fmt::print("corrupted buffer -- no buffer-head body\n"); + break; + } + buf = tb.get(); + auto ts = read_le(buf); + timestamp += ts; + auto skip = read_le(buf + sizeof(uint32_t)); + fmt::print("{:08x}:{} --- buffer (skipped {}) ---\n", consumed, format_ts(timestamp), skip); + consumed = sizeof(uint8_t) + 2 * sizeof(uint32_t); + continue; + } + + if (ev == internal::trace_event::T800) { + auto rem = internal::tracer::buffer_size - consumed - sizeof(uint8_t); + fmt::print("{:08x}:- REM -- skip {} bytes\n", consumed, rem); + if (rem > 0) { + in.read_exactly(rem).get(); + } + consumed = 0; + continue; + } + + tb = in.read_exactly(sizeof(uint16_t)).get(); + if (!tb) { + fmt::print("corrupted buffer -- no timestamp\n"); + break; + } + buf = tb.get(); + auto ts = read_le(buf); + timestamp += ts; + + auto sz = event_body_size(ev); + if (sz > 0) { + tb = in.read_exactly(sz).get(); + if (!tb) { + fmt::print("corrupted file -- no event body\n"); + } + } else { + tb = temporary_buffer(); + } + fmt::print("{:08x}:{} {}\n", consumed, format_ts(timestamp), parse_event(ev, std::move(tb))); + consumed += sz + sizeof(uint8_t) + sizeof(uint16_t); + } + }); + }); +} From f1d51339dd333706fd2a130db39019b9351ebb47 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 28 Aug 2024 17:45:54 +0300 Subject: [PATCH 3/7] io: Wire binary tracer into IO queue First trace generator is going to be IO queue code, so put the tracer object on board and introduce the enabler method. Events will come in the next patch. Signed-off-by: Pavel Emelyanov --- include/seastar/core/io_queue.hh | 11 +++++++++++ src/core/io_queue.cc | 15 +++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index 963cdd1cf5..bf42cb8b49 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -36,6 +36,7 @@ #include #include #include +#include struct io_queue_for_tests; @@ -114,6 +115,8 @@ private: void lower_stall_threshold() noexcept; metrics::metric_groups _metric_groups; + internal::tracer _tracer; + public: using clock_type = std::chrono::steady_clock; @@ -196,6 +199,14 @@ public: request_limits get_request_limits() const noexcept; const config& get_config() const noexcept; + /// @private + template + void trace(Args&&... args) noexcept { + _tracer.trace(std::forward(args)...); + } + + future<> start_tracing(std::chrono::seconds timeout, size_t max_size); + private: static fair_queue::config make_fair_queue_config(const config& cfg, sstring label); void register_stats(sstring name, priority_class_data& pc); diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 52d38aadd3..938413e1a9 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -1030,4 +1030,19 @@ void io_queue::unthrottle_priority_class(const priority_class_data& pc) noexcept } } +future<> io_queue::start_tracing(std::chrono::seconds timeout, size_t max_size) { + auto f = co_await open_file_dma(format("trace.{}.bin", this_shard_id()), open_flags::wo | open_flags::create | open_flags::exclusive); + seastar_logger.info("Started IO-tracing for {}s (max size {})", timeout.count(), max_size); + _tracer.enable(timeout, max_size, std::move(f), [] (char* buf) { + std::string o; + for (int i = 0; i < (int)internal::scheduling_group_count(); i++) { + auto sg = internal::scheduling_group_from_index(i); + o += format("{}:{},", i, sg.short_name()); + } + auto len = o.size() - 1; // trim trailing comma + std::strncpy(buf, o.c_str(), len); + return len; + }); +} + } // seastar namespace From 6e63ea1c13932bfd0115ea704c9ea82adf320026 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 30 Aug 2024 12:55:01 +0300 Subject: [PATCH 4/7] fair_queue: Add tracer reference There's one interesting even that is generated by fair-queue -- when the queue starts waiting for shared capacity. In fact, the output throttler code should exist on IO-queue level (see #2332), but currently it's in the fair-queue, so it will need to generate this event. This patch adds tracer reference on fair-queue for future patching. Signed-off-by: Pavel Emelyanov --- include/seastar/core/fair_queue.hh | 7 ++++++- src/core/fair_queue.cc | 4 +++- src/core/io_queue.cc | 6 +++--- tests/perf/fair_queue_perf.cc | 6 ++++-- tests/unit/fair_queue_test.cc | 4 +++- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/seastar/core/fair_queue.hh b/include/seastar/core/fair_queue.hh index 546cb00b60..312461ed06 100644 --- a/include/seastar/core/fair_queue.hh +++ b/include/seastar/core/fair_queue.hh @@ -39,6 +39,10 @@ namespace bi = boost::intrusive; namespace seastar { +namespace internal { +class tracer; +} + /// \brief describes a request that passes through the \ref fair_queue. /// /// A ticket is specified by a \c weight and a \c size. For example, one can specify a request of \c weight @@ -350,6 +354,7 @@ private: }; std::optional _pending; + internal::tracer& _tracer; void push_priority_class(priority_class_data& pc) noexcept; void push_priority_class_from_idle(priority_class_data& pc) noexcept; @@ -364,7 +369,7 @@ public: /// Constructs a fair queue with configuration parameters \c cfg. /// /// \param cfg an instance of the class \ref config - explicit fair_queue(fair_group& shared, config cfg); + explicit fair_queue(fair_group& shared, internal::tracer&, config cfg); fair_queue(fair_queue&&) = delete; ~fair_queue(); diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc index f23edf3ded..381f62ffd6 100644 --- a/src/core/fair_queue.cc +++ b/src/core/fair_queue.cc @@ -38,6 +38,7 @@ module seastar; #include #include #include +#include #endif namespace seastar { @@ -156,10 +157,11 @@ bool fair_queue::class_compare::operator() (const priority_class_ptr& lhs, const return lhs->_accumulated > rhs->_accumulated; } -fair_queue::fair_queue(fair_group& group, config cfg) +fair_queue::fair_queue(fair_group& group, internal::tracer& trace, config cfg) : _config(std::move(cfg)) , _group(group) , _group_replenish(clock_type::now()) + , _tracer(trace) { } diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 938413e1a9..1244f5221f 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -579,11 +579,11 @@ io_queue::io_queue(io_group_ptr group, internal::io_sink& sink) auto& cfg = get_config(); if (cfg.duplex) { static_assert(internal::io_direction_and_length::write_idx == 0); - _streams.emplace_back(_group->_fgs[0], make_fair_queue_config(cfg, "write")); + _streams.emplace_back(_group->_fgs[0], _tracer, make_fair_queue_config(cfg, "write")); static_assert(internal::io_direction_and_length::read_idx == 1); - _streams.emplace_back(_group->_fgs[1], make_fair_queue_config(cfg, "read")); + _streams.emplace_back(_group->_fgs[1], _tracer, make_fair_queue_config(cfg, "read")); } else { - _streams.emplace_back(_group->_fgs[0], make_fair_queue_config(cfg, "rw")); + _streams.emplace_back(_group->_fgs[0], _tracer, make_fair_queue_config(cfg, "rw")); } _averaging_decay_timer.arm_periodic(std::chrono::duration_cast(_group->io_latency_goal() * cfg.averaging_decay_ticks)); diff --git a/tests/perf/fair_queue_perf.cc b/tests/perf/fair_queue_perf.cc index 525b756fd4..d9a21bbabf 100644 --- a/tests/perf/fair_queue_perf.cc +++ b/tests/perf/fair_queue_perf.cc @@ -28,10 +28,12 @@ #include #include #include +#include static constexpr fair_queue::class_id cid = 0; struct local_fq_and_class { + seastar::internal::tracer trace; seastar::fair_group fg; seastar::fair_queue fq; seastar::fair_queue sfq; @@ -46,8 +48,8 @@ struct local_fq_and_class { local_fq_and_class(seastar::fair_group& sfg) : fg(fg_config(), 1) - , fq(fg, seastar::fair_queue::config()) - , sfq(sfg, seastar::fair_queue::config()) + , fq(fg, trace, seastar::fair_queue::config()) + , sfq(sfg, trace, seastar::fair_queue::config()) { fq.register_priority_class(cid, 1); sfq.register_priority_class(cid, 1); diff --git a/tests/unit/fair_queue_test.cc b/tests/unit/fair_queue_test.cc index fe620c7790..0d4fd44e19 100644 --- a/tests/unit/fair_queue_test.cc +++ b/tests/unit/fair_queue_test.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ struct request { }; class test_env { + internal::tracer _trace; fair_group _fg; fair_queue _fq; std::vector _results; @@ -81,7 +83,7 @@ class test_env { public: test_env(unsigned capacity) : _fg(fg_config(capacity), 1) - , _fq(_fg, fq_config()) + , _fq(_fg, _trace, fq_config()) {} // As long as there is a request sitting in the queue, tick() will process From 313ac6cfbb17f66f124d00531564e7ac8a594582 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 28 Aug 2024 17:54:22 +0300 Subject: [PATCH 5/7] io_tester: Add ability to turn tracing ON Add a CLI option that enables tracing after all preparations are done. The workload jobs will then be traced and trace files will be available afterwards. Signed-off-by: Pavel Emelyanov --- apps/io_tester/io_tester.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/io_tester/io_tester.cc b/apps/io_tester/io_tester.cc index 10cc5c2cdc..db945901cd 100644 --- a/apps/io_tester/io_tester.cc +++ b/apps/io_tester/io_tester.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1114,6 +1115,7 @@ int main(int ac, char** av) { ("duration", bpo::value()->default_value(10), "for how long (in seconds) to run the test") ("conf", bpo::value()->default_value("./conf.yaml"), "YAML file containing benchmark specification") ("keep-files", bpo::value()->default_value(false), "keep test files, next run may re-use them") + ("trace", bpo::value()->default_value(false), "start reactor tracing") ; distributed ctx; @@ -1136,6 +1138,7 @@ int main(int ac, char** av) { } keep_files = opts["keep-files"].as(); + bool trace = opts["trace"].as(); auto& duration = opts["duration"].as(); auto& yaml = opts["conf"].as(); YAML::Node doc = YAML::LoadFile(yaml); @@ -1181,6 +1184,13 @@ int main(int ac, char** av) { ctx.invoke_on_all([] (auto& c) { return c.start(); }).get(); + if (trace) { + std::cout << "Starting tracing..." << std::endl; + auto st = file_stat(storage).get(); + smp::invoke_on_all([st, duration] { + return engine().get_io_queue(st.device_id).start_tracing(std::chrono::seconds(duration), 1ull << 30); + }).get(); + } std::cout << "Starting evaluation..." << std::endl; ctx.invoke_on_all([] (auto& c) { return c.issue_requests(); From 197f436017ec76fde3d04ba05c3a763cbbbd26af Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 28 Aug 2024 17:18:42 +0300 Subject: [PATCH 6/7] io: Add trace events There are 3 interesting events -- when a request is queued, when it's dispatched and when it completes. The first one comes with request ID, class ID, direction and length. To save some space on the trace buffer the length is trimmed down to 512-bytes blocks, so 2 bytes are enough for up to 32-MB requests. The request "ID" is derived from io_desc_read_write pointer as it uniquely identifes a request while it's in-flight. For that, the pointer is disguised -- shifted right for 3 bits thus cutting off always zero bits, and then converted to 32-bit number. This cuts off the upper part of the pointer, but it seems to be "unique enough" for one shard. This even is 1 (event id) + 2 (timestamp) + 1 (direction) + 1 (class) + 2 (length in blocks) + 4 (id) = 11 bytes Dispatch and complete only include the request ID field. Its length is thus 7 bytes (event id + timestamps + request id). Totally, each request generates 25 bytes in the buffer. Signed-off-by: Pavel Emelyanov --- apps/trace/decode.cc | 45 +++++++++++++ include/seastar/core/internal/io_trace.hh | 79 +++++++++++++++++++++++ include/seastar/util/trace.hh | 6 ++ src/core/io_queue.cc | 4 ++ 4 files changed, 134 insertions(+) create mode 100644 include/seastar/core/internal/io_trace.hh diff --git a/apps/trace/decode.cc b/apps/trace/decode.cc index 565335127c..af3e4fc2d4 100644 --- a/apps/trace/decode.cc +++ b/apps/trace/decode.cc @@ -32,6 +32,14 @@ size_t event_body_size(internal::trace_event ev) { switch (ev) { case internal::trace_event::TICK: return 0; + case internal::trace_event::IO_POLL: + return internal::event_tracer::size(); + case internal::trace_event::IO_QUEUE: + return internal::event_tracer::size(); + case internal::trace_event::IO_DISPATCH: + return internal::event_tracer::size(); + case internal::trace_event::IO_COMPLETE: + return internal::event_tracer::size(); case internal::trace_event::BUFFER_HEAD: case internal::trace_event::OPENING: case internal::trace_event::T800: @@ -50,10 +58,47 @@ std::string parse(temporary_buffer body) { return "TICK"; } +template<> +std::string parse(temporary_buffer body) { + return "IO_POLL"; +} + +template<> +std::string parse(temporary_buffer body) { + const char* b = body.get(); + auto rw = read_le(b + 0); + auto cid = read_le(b + 1); + auto len = read_le(b + 2); + auto rq = read_le(b + 4); + return format("IO Q {:04x} {} class {} {}", rq, rw == 0 ? "w" : "r", cid, len << 9); +} + +template<> +std::string parse(temporary_buffer body) { + const char* b = body.get(); + auto rq = read_le(b); + return format("IO D {:04x}", rq); +} + +template<> +std::string parse(temporary_buffer body) { + const char* b = body.get(); + auto rq = read_le(b); + return format("IO C {:04x}", rq); +} + std::string parse_event(internal::trace_event ev, temporary_buffer body) { switch (ev) { case internal::trace_event::TICK: return parse(std::move(body)); + case internal::trace_event::IO_POLL: + return parse(std::move(body)); + case internal::trace_event::IO_QUEUE: + return parse(std::move(body)); + case internal::trace_event::IO_DISPATCH: + return parse(std::move(body)); + case internal::trace_event::IO_COMPLETE: + return parse(std::move(body)); case internal::trace_event::BUFFER_HEAD: case internal::trace_event::OPENING: case internal::trace_event::T800: diff --git a/include/seastar/core/internal/io_trace.hh b/include/seastar/core/internal/io_trace.hh new file mode 100644 index 0000000000..7361bab144 --- /dev/null +++ b/include/seastar/core/internal/io_trace.hh @@ -0,0 +1,79 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2024 ScyllaDB. + */ + + +#pragma once +#include + +namespace seastar { +class io_desc_read_write; + +namespace internal { + +template<> +struct event_tracer { + static int size() noexcept { return 0; } + static void put(char* buf) noexcept { } +}; + +inline uint32_t disguise_request_ptr(const io_desc_read_write& desc) noexcept { + return reinterpret_cast(&desc) >> 3; +} + +template<> +struct event_tracer { + static int size() noexcept { + return sizeof(uint8_t) // direction idx + + sizeof(uint8_t) // class id + + sizeof(uint16_t) // length in blocks + + sizeof(uint32_t); // request "id" + } + static void put(char* buf, const io_desc_read_write& desc, unsigned class_id, int rw_idx, size_t nr_blocks) noexcept { + write_le(buf + 0, (uint8_t)rw_idx); + write_le(buf + 1, (uint8_t)class_id); + write_le(buf + 2, (uint16_t)nr_blocks); // up to 32Mb + write_le(buf + 4, disguise_request_ptr(desc)); + } +}; + +template<> +struct event_tracer { + static int size() noexcept { + return sizeof(uint32_t); // request "id" + } + static void put(char* buf, const io_desc_read_write& desc) noexcept { + write_le(buf, disguise_request_ptr(desc)); + } +}; + +template<> +struct event_tracer { + static int size() noexcept { + return sizeof(uint32_t); // request "id" + } + static void put(char* buf, const io_desc_read_write& desc) noexcept { + write_le(buf, disguise_request_ptr(desc)); + } +}; + +} // internal namespace +} // seastar namespace diff --git a/include/seastar/util/trace.hh b/include/seastar/util/trace.hh index a895dd2a98..fa14c5993d 100644 --- a/include/seastar/util/trace.hh +++ b/include/seastar/util/trace.hh @@ -37,6 +37,12 @@ enum class trace_event { BUFFER_HEAD, T800, TICK, + + // IO events + IO_POLL, + IO_QUEUE, + IO_DISPATCH, + IO_COMPLETE, }; size_t tick_event_size(); diff --git a/src/core/io_queue.cc b/src/core/io_queue.cc index 1244f5221f..cf88d670f3 100644 --- a/src/core/io_queue.cc +++ b/src/core/io_queue.cc @@ -48,6 +48,7 @@ module seastar; #include #include #include +#include #endif namespace seastar { @@ -231,6 +232,7 @@ class io_desc_read_write final : public io_completion { , _iovs(std::move(iovs)) { io_log.trace("dev {} : req {} queue len {} capacity {}", _ioq.dev_id(), fmt::ptr(this), _dnl.length(), _fq_capacity); + _ioq.trace(*this, _pclass.fq_class(), _dnl.rw_idx(), _dnl.length() >> io_queue::block_size_shift); } virtual void set_exception(std::exception_ptr eptr) noexcept override { @@ -243,6 +245,7 @@ class io_desc_read_write final : public io_completion { virtual void complete(size_t res) noexcept override { io_log.trace("dev {} : req {} complete", _ioq.dev_id(), fmt::ptr(this)); + _ioq.trace(*this); auto now = io_queue::clock_type::now(); auto delay = std::chrono::duration_cast>(now - _ts); _pclass.on_complete(delay); @@ -259,6 +262,7 @@ class io_desc_read_write final : public io_completion { void dispatch() noexcept { io_log.trace("dev {} : req {} submit", _ioq.dev_id(), fmt::ptr(this)); + _ioq.trace(*this); auto now = io_queue::clock_type::now(); _pclass.on_dispatch(_dnl, std::chrono::duration_cast>(now - _ts)); _ts = now; From cdd9734288d516d489fc90e719e5a1de203c448e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 30 Aug 2024 12:55:19 +0300 Subject: [PATCH 7/7] fq: Add trace event There's only one interesting event for it -- when the queue goes "pending" state waiting for shared capacity. No arguments here, so it just takes 3 bytes in the buffer. Signed-off-by: Pavel Emelyanov --- apps/trace/decode.cc | 10 ++++++ include/seastar/core/internal/fq_trace.hh | 37 +++++++++++++++++++++++ include/seastar/util/trace.hh | 1 + src/core/fair_queue.cc | 1 + 4 files changed, 49 insertions(+) create mode 100644 include/seastar/core/internal/fq_trace.hh diff --git a/apps/trace/decode.cc b/apps/trace/decode.cc index af3e4fc2d4..6522d62ca5 100644 --- a/apps/trace/decode.cc +++ b/apps/trace/decode.cc @@ -25,6 +25,7 @@ #include #include #include +#include using namespace seastar; @@ -40,6 +41,8 @@ size_t event_body_size(internal::trace_event ev) { return internal::event_tracer::size(); case internal::trace_event::IO_COMPLETE: return internal::event_tracer::size(); + case internal::trace_event::FQ_WAIT_CAPACITY: + return internal::event_tracer::size(); case internal::trace_event::BUFFER_HEAD: case internal::trace_event::OPENING: case internal::trace_event::T800: @@ -87,6 +90,11 @@ std::string parse(temporary_buffer bod return format("IO C {:04x}", rq); } +template<> +std::string parse(temporary_buffer body) { + return "FQ WAIT"; +} + std::string parse_event(internal::trace_event ev, temporary_buffer body) { switch (ev) { case internal::trace_event::TICK: @@ -99,6 +107,8 @@ std::string parse_event(internal::trace_event ev, temporary_buffer body) { return parse(std::move(body)); case internal::trace_event::IO_COMPLETE: return parse(std::move(body)); + case internal::trace_event::FQ_WAIT_CAPACITY: + return parse(std::move(body)); case internal::trace_event::BUFFER_HEAD: case internal::trace_event::OPENING: case internal::trace_event::T800: diff --git a/include/seastar/core/internal/fq_trace.hh b/include/seastar/core/internal/fq_trace.hh new file mode 100644 index 0000000000..2d84c7137e --- /dev/null +++ b/include/seastar/core/internal/fq_trace.hh @@ -0,0 +1,37 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2024 ScyllaDB. + */ + + +#pragma once +#include + +namespace seastar { +namespace internal { + +template<> +struct event_tracer { + static int size() noexcept { return 0; } + static void put(char* buf) noexcept { } +}; + +} // internal namespace +} // seastar namespace diff --git a/include/seastar/util/trace.hh b/include/seastar/util/trace.hh index fa14c5993d..939953890c 100644 --- a/include/seastar/util/trace.hh +++ b/include/seastar/util/trace.hh @@ -43,6 +43,7 @@ enum class trace_event { IO_QUEUE, IO_DISPATCH, IO_COMPLETE, + FQ_WAIT_CAPACITY, }; size_t tick_event_size(); diff --git a/src/core/fair_queue.cc b/src/core/fair_queue.cc index 381f62ffd6..3af2bdfef5 100644 --- a/src/core/fair_queue.cc +++ b/src/core/fair_queue.cc @@ -249,6 +249,7 @@ auto fair_queue::grab_capacity(const fair_queue_entry& ent) noexcept -> grab_res capacity_t cap = ent._capacity; capacity_t want_head = _group.grab_capacity(cap); if (_group.capacity_deficiency(want_head)) { + _tracer.trace(); _pending.emplace(want_head, cap); return grab_result::pending; }