Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions media_kit_video/windows/d3d11_renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,14 @@
//
// The MailboxSwapChain is passed directly to mpv as the IDXGISwapChain* in
// mpv_dxgi_init_params. mpv calls GetBuffer(0, ...) to obtain a render
// target, renders into it, and flushes. The plugin then calls
// ProducerCommit() to atomically publish the frame. Flutter's
// GpuSurfaceTexture callback calls ConsumerAcquire() to receive the DXGI
// shared HANDLE of the newest complete frame — with no copy and no OS lock.
// target and submits GPU work into it. The plugin then calls
// ProducerCommit(), which (a) signals a fence on the submitted work,
// (b) non-blockingly checks the *previous* frame's fence and, if already
// GPU-complete, promotes it to latest_completed_slot_, and (c) atomically
// publishes write_slot_ as the new pending frame. Flutter's
// GpuSurfaceTexture callback calls ConsumerAcquire() — a single acquire
// load of latest_completed_slot_ — to receive the DXGI shared HANDLE of
// the newest confirmed frame, with no copy, no flush, and no OS lock.
class D3D11Renderer {
public:
int32_t width() const { return width_; }
Expand All @@ -57,11 +61,14 @@ class D3D11Renderer {
void SetSize(int32_t width, int32_t height);

// Called from the producer thread (mpv thread pool) after
// mpv_render_context_render returns. Publishes the rendered frame.
// mpv_render_context_render returns. Signals the frame fence, then
// non-blockingly attempts to promote the previous pending frame to
// latest_completed_slot_, and finally publishes the new pending frame.
void ProducerCommit();

// Called from the consumer thread (Flutter GpuSurfaceTexture callback).
// Returns the DXGI shared HANDLE of the most recent complete frame.
// Returns the DXGI shared HANDLE of the most recent fence-confirmed frame
// via a single atomic load — no fence poll, no flush, no stall.
HANDLE ConsumerAcquire();

// Returns the DXGI shared HANDLE for the current read slot without
Expand Down
120 changes: 73 additions & 47 deletions media_kit_video/windows/mailbox_swap_chain.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,51 +111,92 @@ MailboxSwapChain::GetDesc(DXGI_SWAP_CHAIN_DESC* pDesc) {
}

void MailboxSwapChain::ProducerCommit() {
auto& ws = slots_[write_slot_];
context4_->Signal(ws.fence.Get(), ++ws.fence_value);

// This runs one full render-cycle after the *previous* Signal was enqueued.
// By then the D3D11 runtime has had ample opportunity to submit the prior
// command buffer to the GPU, so GetCompletedValue() is far more likely to
// have advanced than it would be inside ConsumerAcquire (which can be
// called microseconds after the Signal). The check is non-blocking: if
// the fence isn't done yet, we simply leave latest_completed_slot_ as-is
// and try again next frame.
//
// On success we do a combined promotion CAS on mailbox_state_:
// (has_pending=1, pending=P, completed=C, free=F)
// → (has_pending=0, extra=C, completed=P, free=F)
// then store latest_completed_slot_ = P with release ordering so that
// ConsumerAcquire's acquire load cannot observe P before mailbox_state_
// reflects P in the 'completed' role (i.e., protected from the producer).
{
auto& ws = slots_[write_slot_];
context4_->Signal(ws.fence.Get(), ++ws.fence_value);
uint32_t snap = mailbox_state_.load(std::memory_order_acquire);
if (snap & (1u << 6)) { // has_pending
const int pend = static_cast<int>((snap >> 4) & 0x3u);
const int comp = static_cast<int>((snap >> 2) & 0x3u);
const int fr = static_cast<int>( snap & 0x3u);
if (slots_[pend].fence->GetCompletedValue() >=
slots_[pend].fence_value) {
const uint32_t snap_desired =
(static_cast<uint32_t>(comp) << 4) |
(static_cast<uint32_t>(pend) << 2) |
static_cast<uint32_t>(fr);
if (mailbox_state_.compare_exchange_strong(
snap, snap_desired,
std::memory_order_acq_rel,
std::memory_order_relaxed)) {
latest_completed_slot_.store(pend, std::memory_order_release);
}
// mailbox_state_ is only written from the single producer thread
// (ConsumerAcquire just loads latest_completed_slot_), so this CAS is not
// expected to fail. If it ever does, no promotion took place: leave
// latest_completed_slot_ untouched and try again on the next commit.
}
}
}

const uint32_t desired = static_cast<uint32_t>(write_slot_) | 0x4u;
// Desired state:
// has_pending = 1
// pending = write_slot_ (new latest frame)
// completed = old completed_slot (unchanged)
// free = old pending_or_extra (recycled: was old pending or extra_free)
//
// new write_slot_ (producer-private) = old free_slot.
uint32_t expected = mailbox_state_.load(std::memory_order_relaxed);
while (!mailbox_state_.compare_exchange_weak(
expected, desired, std::memory_order_release,
std::memory_order_relaxed)) {}
write_slot_ = static_cast<int>(expected & 0x3u);
while (true) {
const int old_free = static_cast<int>( expected & 0x3u);
const int old_completed = static_cast<int>((expected >> 2) & 0x3u);
const int old_poe = static_cast<int>((expected >> 4) & 0x3u);
const uint32_t desired =
(1u << 6) |
(static_cast<uint32_t>(write_slot_) << 4) |
(static_cast<uint32_t>(old_completed) << 2) |
static_cast<uint32_t>(old_poe);
if (mailbox_state_.compare_exchange_weak(
expected, desired,
std::memory_order_release,
std::memory_order_relaxed)) {
write_slot_ = old_free;
break;
}
}
}

HANDLE MailboxSwapChain::ConsumerAcquire() {
uint32_t expected = mailbox_state_.load(std::memory_order_acquire);
if (!(expected & 0x4u)) {
// No new frame — we already waited for this slot last time we acquired it.
return slots_[read_slot_].shared_handle;
}
const uint32_t desired = static_cast<uint32_t>(read_slot_); // dirty=0
while (!mailbox_state_.compare_exchange_weak(
expected, desired, std::memory_order_acq_rel,
std::memory_order_relaxed)) {
if (!(expected & 0x4u))
return slots_[read_slot_].shared_handle;
}
read_slot_ = static_cast<int>(expected & 0x3u);

auto& rs = slots_[read_slot_];
if (rs.fence->GetCompletedValue() < rs.fence_value) {
if (SUCCEEDED(rs.fence->SetEventOnCompletion(rs.fence_value,
rs.fence_event))) {
::WaitForSingleObject(rs.fence_event, INFINITE);
}
}
return rs.shared_handle;
// Always return the most recently fence-confirmed frame.
// Advancement is handled exclusively by ProducerCommit (called one full
// render-cycle after each Signal, where fence completion is far more
// likely).
return slots_[latest_completed_slot_.load(std::memory_order_acquire)]
.shared_handle;
}

HRESULT MailboxSwapChain::Resize(int32_t width, int32_t height) {
ReleaseSlots();
width_ = (width > 0) ? width : 1;
height_ = (height > 0) ? height : 1;
mailbox_state_.store(2u, std::memory_order_relaxed);
mailbox_state_.store(57u, std::memory_order_relaxed);
latest_completed_slot_.store(2, std::memory_order_relaxed);
write_slot_ = 0;
read_slot_ = 1;
return AllocateSlots();
}

Expand Down Expand Up @@ -184,7 +225,7 @@ HRESULT MailboxSwapChain::AllocateSlots() {
desc.CPUAccessFlags = 0;
desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED;

for (int i = 0; i < 3; ++i) {
for (int i = 0; i < 4; ++i) {
HRESULT hr = device_->CreateTexture2D(&desc, nullptr, &slots_[i].texture);
if (FAILED(hr)) {
std::cout << "media_kit: MailboxSwapChain: CreateTexture2D slot " << i
Expand Down Expand Up @@ -220,17 +261,6 @@ HRESULT MailboxSwapChain::AllocateSlots() {
return hr;
}
slots_[i].fence_value = 0;

slots_[i].fence_event =
::CreateEventW(nullptr, /*bManualReset=*/FALSE, /*bInitialState=*/FALSE,
nullptr);
if (!slots_[i].fence_event) {
const HRESULT hrE = HRESULT_FROM_WIN32(::GetLastError());
std::cout << "media_kit: MailboxSwapChain: CreateEvent slot " << i
<< " failed (hr=0x" << std::hex << hrE << std::dec << ")"
<< std::endl;
return hrE;
}
}

return S_OK;
Expand All @@ -242,9 +272,5 @@ void MailboxSwapChain::ReleaseSlots() {
slot.shared_handle = nullptr;
slot.fence.Reset();
slot.fence_value = 0;
if (slot.fence_event) {
::CloseHandle(slot.fence_event);
slot.fence_event = nullptr;
}
}
}
52 changes: 39 additions & 13 deletions media_kit_video/windows/mailbox_swap_chain.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,25 @@
#include <atomic>
#include <cstdint>

// Minimal IDXGISwapChain facade backed by a lock-free triple-buffer mailbox.
// Minimal IDXGISwapChain facade backed by a lock-free 4-slot mailbox with
// a last-completed-frame cache.
//
// Three BGRA8 textures are kept, each with a DXGI shared HANDLE.
// Four BGRA8 textures are kept, each with a DXGI shared HANDLE.
// mailbox_state_ is a single atomic<uint32_t>:
// bits [1:0] slot index in the mailbox (0-2)
// bit [2] dirty flag: 1 = producer has committed a new frame
// bits [1:0] = free_slot (0-3): producer takes this for the next frame
// bits [3:2] = completed_slot (0-3): most recent fence-confirmed frame;
// safe Consumer fallback at any time
// bits [5:4] = pending_or_extra (0-3): has_pending=1 → latest submitted frame
// (fence may still be in-flight);
// has_pending=0 → second free slot
// bit [6] = has_pending : 1 = bits [5:4] hold a submitted frame that is
// still awaiting fence confirmation (promotion to completed_slot)
//
// {write_slot_, mailbox slot, read_slot_} is always a permutation of {0,1,2}.
// 4-slot invariant (all roles are always distinct):
// has_pending=1: write_slot_(private) | free | pending | completed = 4 slots
// has_pending=0: write_slot_(private) | free | extra_free | completed = 4 slots
//
// Initial value 57u = 0b0_11_10_01:
// has_pending=0, extra_free=3, completed=2, free=1, write_slot_=0 (private)
class MailboxSwapChain final : public IDXGISwapChain {
public:
// Returns an AddRef'd pointer (ref count = 1). device must outlive this.
Expand Down Expand Up @@ -95,19 +106,28 @@ class MailboxSwapChain final : public IDXGISwapChain {
}

// Called from the producer thread after mpv_render_context_render returns.
// In addition to publishing write_slot_ as the new pending frame, it
// non-blockingly polls the *previous* pending frame's fence and, if the GPU
// has already completed it, promotes it to completed and updates
// latest_completed_slot_ (release store). This is the sole site that
// advances latest_completed_slot_; ConsumerAcquire never touches the fence.
void ProducerCommit();

// Called from the consumer thread (Flutter GpuSurfaceTexture callback).
// Returns the DXGI shared HANDLE of the most recent complete frame.
// Returns the DXGI shared HANDLE of the most recent fence-confirmed frame.
// Implementation is a single acquire load of latest_completed_slot_ —
// no CAS, no fence poll, no flush, no stall, no KeyedMutex.
HANDLE ConsumerAcquire();

// Recreates all four texture slots at the new dimensions.
// Must only be called from the producer thread with no active consumer.
HRESULT Resize(int32_t width, int32_t height);

// Returns the current read-slot HANDLE without advancing mailbox state.
// Returns the latest GPU-confirmed HANDLE without advancing mailbox state.
// Safe to call before the consumer thread starts.
HANDLE ReadHandleSnapshot() const {
return slots_[read_slot_].shared_handle;
return slots_[latest_completed_slot_.load(std::memory_order_acquire)]
.shared_handle;
}

int32_t width() const { return width_; }
Expand All @@ -127,7 +147,6 @@ class MailboxSwapChain final : public IDXGISwapChain {
Microsoft::WRL::ComPtr<ID3D11Texture2D> texture;
HANDLE shared_handle = nullptr;
Microsoft::WRL::ComPtr<ID3D11Fence> fence;
HANDLE fence_event = nullptr;
uint64_t fence_value = 0;
};

Expand All @@ -137,13 +156,20 @@ class MailboxSwapChain final : public IDXGISwapChain {
int32_t width_ = 1;
int32_t height_ = 1;

TextureSlot slots_[3];
TextureSlot slots_[4];

// Lock-free mailbox state — see bit-field comment at top of class.
// Initial value 57u = 0b0_11_10_01.
std::atomic<uint32_t> mailbox_state_{57u};

// Lock-free mailbox: bits [1:0] = slot index (0-2), bit [2] = dirty; init = 2u.
std::atomic<uint32_t> mailbox_state_{2u};
// Cache of the most recently fence-confirmed completed slot.
// ConsumerAcquire reads this directly (one atomic load, no CAS, no fence
// poll). Updated by ProducerCommit after a successful non-blocking
// pending→completed promotion. Initialised to 2, which matches the
// 'completed' field in mailbox_state_'s initial value 57u.
std::atomic<int> latest_completed_slot_{2};

int write_slot_ = 0; // producer-private
int read_slot_ = 1; // consumer-private

std::atomic<ULONG> ref_count_{1u};
};
Expand Down
6 changes: 2 additions & 4 deletions media_kit_video/windows/video_output.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,6 @@ void VideoOutput::Render() {
if (d3d11_renderer_ != nullptr) {
mpv_render_context_render(render_context_, nullptr);
mpv_render_context_report_swap(render_context_);
// Atomically publish the rendered slot to the mailbox so that Flutter's
// GpuSurfaceTexture callback can import it without a copy.
d3d11_renderer_->ProducerCommit();
}
// S/W
Expand Down Expand Up @@ -299,8 +297,8 @@ void VideoOutput::Resize(int64_t required_width, int64_t required_height) {

auto texture = std::make_unique<FlutterDesktopGpuSurfaceDescriptor>();
texture->struct_size = sizeof(FlutterDesktopGpuSurfaceDescriptor);
// Seed with the current read-slot handle so Flutter has a valid surface
// even before the first mpv frame is committed.
// Seed with the latest-completed-slot handle so Flutter has a valid
// surface even before the first mpv frame is committed.
texture->handle = d3d11_renderer_->ReadHandleSnapshot();
texture->width = texture->visible_width = d3d11_renderer_->width();
texture->height = texture->visible_height = d3d11_renderer_->height();
Expand Down
Loading