diff --git a/media_kit_video/windows/d3d11_renderer.h b/media_kit_video/windows/d3d11_renderer.h index 88b628d49..7b67ccd35 100644 --- a/media_kit_video/windows/d3d11_renderer.h +++ b/media_kit_video/windows/d3d11_renderer.h @@ -27,10 +27,14 @@ // // The MailboxSwapChain is passed directly to mpv as the IDXGISwapChain* in // mpv_dxgi_init_params. mpv calls GetBuffer(0, ...) to obtain a render -// target, renders into it, and flushes. The plugin then calls -// ProducerCommit() to atomically publish the frame. Flutter's -// GpuSurfaceTexture callback calls ConsumerAcquire() to receive the DXGI -// shared HANDLE of the newest complete frame — with no copy and no OS lock. +// target and submits GPU work into it. The plugin then calls +// ProducerCommit(), which (a) signals a fence on the submitted work, +// (b) non-blockingly checks the *previous* frame's fence and, if already +// GPU-complete, promotes it to latest_completed_slot_, and (c) atomically +// publishes write_slot_ as the new pending frame. Flutter's +// GpuSurfaceTexture callback calls ConsumerAcquire() — a single acquire +// load of latest_completed_slot_ — to receive the DXGI shared HANDLE of +// the newest confirmed frame, with no copy, no flush, and no OS lock. class D3D11Renderer { public: int32_t width() const { return width_; } @@ -57,11 +61,14 @@ class D3D11Renderer { void SetSize(int32_t width, int32_t height); // Called from the producer thread (mpv thread pool) after - // mpv_render_context_render returns. Publishes the rendered frame. + // mpv_render_context_render returns. Signals the frame fence, then + // non-blockingly attempts to promote the previous pending frame to + // latest_completed_slot_, and finally publishes the new pending frame. void ProducerCommit(); // Called from the consumer thread (Flutter GpuSurfaceTexture callback). - // Returns the DXGI shared HANDLE of the most recent complete frame. 
+ // Returns the DXGI shared HANDLE of the most recent fence-confirmed frame + // via a single atomic load — no fence poll, no flush, no stall. HANDLE ConsumerAcquire(); // Returns the DXGI shared HANDLE for the current read slot without diff --git a/media_kit_video/windows/mailbox_swap_chain.cc b/media_kit_video/windows/mailbox_swap_chain.cc index 39c6098d7..31b25bbce 100644 --- a/media_kit_video/windows/mailbox_swap_chain.cc +++ b/media_kit_video/windows/mailbox_swap_chain.cc @@ -111,51 +111,92 @@ MailboxSwapChain::GetDesc(DXGI_SWAP_CHAIN_DESC* pDesc) { } void MailboxSwapChain::ProducerCommit() { + auto& ws = slots_[write_slot_]; + context4_->Signal(ws.fence.Get(), ++ws.fence_value); + + // This runs one full render-cycle after the *previous* Signal was enqueued. + // By then the D3D11 runtime has had ample opportunity to submit the prior + // command buffer to the GPU, so GetCompletedValue() is far more likely to + // have advanced than it would be inside ConsumerAcquire (which can be + // called microseconds after the Signal). The check is non-blocking: if + // the fence isn't done yet, we simply leave latest_completed_slot_ as-is + // and try again next frame. + // + // On success we do a combined promotion CAS on mailbox_state_: + // (has_pending=1, pending=P, completed=C, free=F) + // → (has_pending=0, extra=C, completed=P, free=F) + // then store latest_completed_slot_ = P with release ordering so that + // ConsumerAcquire's acquire load cannot observe P before mailbox_state_ + // reflects P in the 'completed' role (i.e., protected from the producer). 
{ - auto& ws = slots_[write_slot_]; - context4_->Signal(ws.fence.Get(), ++ws.fence_value); + uint32_t snap = mailbox_state_.load(std::memory_order_acquire); + if (snap & (1u << 6)) { // has_pending + const int pend = static_cast((snap >> 4) & 0x3u); + const int comp = static_cast((snap >> 2) & 0x3u); + const int fr = static_cast( snap & 0x3u); + if (slots_[pend].fence->GetCompletedValue() >= + slots_[pend].fence_value) { + const uint32_t snap_desired = + (static_cast(comp) << 4) | + (static_cast(pend) << 2) | + static_cast(fr); + if (mailbox_state_.compare_exchange_strong( + snap, snap_desired, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + latest_completed_slot_.store(pend, std::memory_order_release); + } + // ProducerCommit runs on a single producer thread and ConsumerAcquire + // never writes mailbox_state_, so this strong CAS is not expected to + // fail. If it ever does (state changed since the load above), skip + // the promotion and leave latest_completed_slot_ untouched. + } + } } - const uint32_t desired = static_cast(write_slot_) | 0x4u; + // Desired state: + // has_pending = 1 + // pending = write_slot_ (new latest frame) + // completed = old completed_slot (unchanged) + // free = old pending_or_extra (recycled: was old pending or extra_free) + // + // new write_slot_ (producer-private) = old free_slot.
uint32_t expected = mailbox_state_.load(std::memory_order_relaxed); - while (!mailbox_state_.compare_exchange_weak( - expected, desired, std::memory_order_release, - std::memory_order_relaxed)) {} - write_slot_ = static_cast(expected & 0x3u); + while (true) { + const int old_free = static_cast( expected & 0x3u); + const int old_completed = static_cast((expected >> 2) & 0x3u); + const int old_poe = static_cast((expected >> 4) & 0x3u); + const uint32_t desired = + (1u << 6) | + (static_cast(write_slot_) << 4) | + (static_cast(old_completed) << 2) | + static_cast(old_poe); + if (mailbox_state_.compare_exchange_weak( + expected, desired, + std::memory_order_release, + std::memory_order_relaxed)) { + write_slot_ = old_free; + break; + } + } } HANDLE MailboxSwapChain::ConsumerAcquire() { - uint32_t expected = mailbox_state_.load(std::memory_order_acquire); - if (!(expected & 0x4u)) { - // No new frame — we already waited for this slot last time we acquired it. - return slots_[read_slot_].shared_handle; - } - const uint32_t desired = static_cast(read_slot_); // dirty=0 - while (!mailbox_state_.compare_exchange_weak( - expected, desired, std::memory_order_acq_rel, - std::memory_order_relaxed)) { - if (!(expected & 0x4u)) - return slots_[read_slot_].shared_handle; - } - read_slot_ = static_cast(expected & 0x3u); - - auto& rs = slots_[read_slot_]; - if (rs.fence->GetCompletedValue() < rs.fence_value) { - if (SUCCEEDED(rs.fence->SetEventOnCompletion(rs.fence_value, - rs.fence_event))) { - ::WaitForSingleObject(rs.fence_event, INFINITE); - } - } - return rs.shared_handle; + // Always return the most recently fence-confirmed frame. + // Advancement is handled exclusively by ProducerCommit (called one full + // render-cycle after each Signal, where fence completion is far more + // likely). 
+ return slots_[latest_completed_slot_.load(std::memory_order_acquire)] + .shared_handle; } HRESULT MailboxSwapChain::Resize(int32_t width, int32_t height) { ReleaseSlots(); width_ = (width > 0) ? width : 1; height_ = (height > 0) ? height : 1; - mailbox_state_.store(2u, std::memory_order_relaxed); + mailbox_state_.store(57u, std::memory_order_relaxed); + latest_completed_slot_.store(2, std::memory_order_relaxed); write_slot_ = 0; - read_slot_ = 1; return AllocateSlots(); } @@ -184,7 +225,7 @@ HRESULT MailboxSwapChain::AllocateSlots() { desc.CPUAccessFlags = 0; desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < 4; ++i) { HRESULT hr = device_->CreateTexture2D(&desc, nullptr, &slots_[i].texture); if (FAILED(hr)) { std::cout << "media_kit: MailboxSwapChain: CreateTexture2D slot " << i @@ -220,17 +261,6 @@ HRESULT MailboxSwapChain::AllocateSlots() { return hr; } slots_[i].fence_value = 0; - - slots_[i].fence_event = - ::CreateEventW(nullptr, /*bManualReset=*/FALSE, /*bInitialState=*/FALSE, - nullptr); - if (!slots_[i].fence_event) { - const HRESULT hrE = HRESULT_FROM_WIN32(::GetLastError()); - std::cout << "media_kit: MailboxSwapChain: CreateEvent slot " << i - << " failed (hr=0x" << std::hex << hrE << std::dec << ")" - << std::endl; - return hrE; - } } return S_OK; @@ -242,9 +272,5 @@ void MailboxSwapChain::ReleaseSlots() { slot.shared_handle = nullptr; slot.fence.Reset(); slot.fence_value = 0; - if (slot.fence_event) { - ::CloseHandle(slot.fence_event); - slot.fence_event = nullptr; - } } } \ No newline at end of file diff --git a/media_kit_video/windows/mailbox_swap_chain.h b/media_kit_video/windows/mailbox_swap_chain.h index 5c1eaacd9..1a5b8b24d 100644 --- a/media_kit_video/windows/mailbox_swap_chain.h +++ b/media_kit_video/windows/mailbox_swap_chain.h @@ -18,14 +18,25 @@ #include #include -// Minimal IDXGISwapChain facade backed by a lock-free triple-buffer mailbox. 
+// Minimal IDXGISwapChain facade backed by a lock-free 4-slot mailbox with +// a last-completed-frame cache. // -// Three BGRA8 textures are kept, each with a DXGI shared HANDLE. +// Four BGRA8 textures are kept, each with a DXGI shared HANDLE. // mailbox_state_ is a single atomic: -// bits [1:0] slot index in the mailbox (0-2) -// bit [2] dirty flag: 1 = producer has committed a new frame +// bits [1:0] = free_slot (0-3): producer takes this for the next frame +// bits [3:2] = completed_slot (0-3): most recent fence-confirmed frame; +// safe Consumer fallback at any time +// bits [5:4] = pending_or_extra (0-3): has_pending=1 → latest submitted frame +// (fence may still be in-flight); +// has_pending=0 → second free slot +// bit [6] = has_pending : 1 = a new frame is waiting to be consumed // -// {write_slot_, mailbox slot, read_slot_} is always a permutation of {0,1,2}. +// 4-slot invariant (all roles are always distinct): +// has_pending=1: write_slot_(private) | free | pending | completed = 4 slots +// has_pending=0: write_slot_(private) | free | extra_free | completed = 4 slots +// +// Initial value 57u = 0b0_11_10_01: +// has_pending=0, extra_free=3, completed=2, free=1, write_slot_=0 (private) class MailboxSwapChain final : public IDXGISwapChain { public: // Returns an AddRef'd pointer (ref count = 1). device must outlive this. @@ -95,19 +106,28 @@ class MailboxSwapChain final : public IDXGISwapChain { } // Called from the producer thread after mpv_render_context_render returns. + // In addition to publishing write_slot_ as the new pending frame, it + // non-blockingly polls the *previous* pending frame's fence and, if the GPU + // has already completed it, promotes it to completed and updates + // latest_completed_slot_ (release store). This is the sole site that + // advances latest_completed_slot_; ConsumerAcquire never touches the fence. void ProducerCommit(); // Called from the consumer thread (Flutter GpuSurfaceTexture callback). 
- // Returns the DXGI shared HANDLE of the most recent complete frame. + // Returns the DXGI shared HANDLE of the most recent fence-confirmed frame. + // Implementation is a single acquire load of latest_completed_slot_ — + // no CAS, no fence poll, no flush, no stall, no KeyedMutex. HANDLE ConsumerAcquire(); - // Recreates all three texture slots at the new dimensions. + // Recreates all four texture slots at the new dimensions. // Must only be called from the producer thread with no active consumer. HRESULT Resize(int32_t width, int32_t height); - // Returns the current read-slot HANDLE without advancing mailbox state. + // Returns the latest GPU-confirmed HANDLE without advancing mailbox state. + // Safe to call before the consumer thread starts. HANDLE ReadHandleSnapshot() const { - return slots_[read_slot_].shared_handle; + return slots_[latest_completed_slot_.load(std::memory_order_acquire)] + .shared_handle; } int32_t width() const { return width_; } @@ -127,7 +147,6 @@ class MailboxSwapChain final : public IDXGISwapChain { Microsoft::WRL::ComPtr texture; HANDLE shared_handle = nullptr; Microsoft::WRL::ComPtr fence; - HANDLE fence_event = nullptr; uint64_t fence_value = 0; }; @@ -137,13 +156,20 @@ class MailboxSwapChain final : public IDXGISwapChain { int32_t width_ = 1; int32_t height_ = 1; - TextureSlot slots_[3]; + TextureSlot slots_[4]; + + // Lock-free mailbox state — see bit-field comment at top of class. + // Initial value 57u = 0b0_11_10_01. + std::atomic mailbox_state_{57u}; - // Lock-free mailbox: bits [1:0] = slot index (0-2), bit [2] = dirty; init = 2u. - std::atomic mailbox_state_{2u}; + // Cache of the most recently fence-confirmed completed slot. + // ConsumerAcquire reads this directly (one atomic load, no CAS, no fence + // poll). Updated by ProducerCommit after a successful non-blocking + // pending→completed promotion. Initialised to 2, which matches the + // 'completed' field in mailbox_state_'s initial value 57u.
+ std::atomic latest_completed_slot_{2}; int write_slot_ = 0; // producer-private - int read_slot_ = 1; // consumer-private std::atomic ref_count_{1u}; }; diff --git a/media_kit_video/windows/video_output.cc b/media_kit_video/windows/video_output.cc index 837971f35..c55e29bff 100644 --- a/media_kit_video/windows/video_output.cc +++ b/media_kit_video/windows/video_output.cc @@ -166,8 +166,6 @@ void VideoOutput::Render() { if (d3d11_renderer_ != nullptr) { mpv_render_context_render(render_context_, nullptr); mpv_render_context_report_swap(render_context_); - // Atomically publish the rendered slot to the mailbox so that Flutter's - // GpuSurfaceTexture callback can import it without a copy. d3d11_renderer_->ProducerCommit(); } // S/W @@ -299,8 +297,8 @@ void VideoOutput::Resize(int64_t required_width, int64_t required_height) { auto texture = std::make_unique(); texture->struct_size = sizeof(FlutterDesktopGpuSurfaceDescriptor); - // Seed with the current read-slot handle so Flutter has a valid surface - // even before the first mpv frame is committed. + // Seed with the latest-completed-slot handle so Flutter has a valid + // surface even before the first mpv frame is committed. texture->handle = d3d11_renderer_->ReadHandleSnapshot(); texture->width = texture->visible_width = d3d11_renderer_->width(); texture->height = texture->visible_height = d3d11_renderer_->height();