2 Commits

Author SHA1 Message Date
Aiden
709d3d3fa4 Test works
All checks were successful
CI / React UI Build (push) Successful in 11s
CI / Native Windows Build And Tests (push) Successful in 2m53s
CI / Windows Release Package (push) Successful in 3m1s
2026-05-12 01:30:30 +10:00
Aiden
ea31d0ca13 Clean 2026-05-12 01:21:42 +10:00
13 changed files with 1637 additions and 35 deletions

17
.vscode/launch.json vendored
View File

@@ -61,6 +61,23 @@
"moduleLoad": true
},
"preLaunchTask": "Build LoopThroughWithOpenGLCompositing Debug x64"
},
{
"name": "Debug DeckLinkRenderCadenceProbe",
"type": "cppvsdbg",
"request": "launch",
"program": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug\\DeckLinkRenderCadenceProbe.exe",
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug",
"environment": [],
"console": "externalTerminal",
"symbolSearchPath": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug",
"requireExactSource": true,
"logging": {
"moduleLoad": true
},
"preLaunchTask": "Build DeckLinkRenderCadenceProbe Debug x64"
}
]
}

16
.vscode/tasks.json vendored
View File

@@ -36,6 +36,22 @@
"group": "build",
"problemMatcher": "$msCompile"
},
{
"label": "Build DeckLinkRenderCadenceProbe Debug x64",
"type": "process",
"command": "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\IDE\\CommonExtensions\\Microsoft\\CMake\\CMake\\bin\\cmake.exe",
"args": [
"--build",
"${workspaceFolder}\\build\\vs2022-x64-debug",
"--config",
"Debug",
"--target",
"DeckLinkRenderCadenceProbe",
"--parallel"
],
"group": "build",
"problemMatcher": "$msCompile"
},
{
"label": "Clean LoopThroughWithOpenGLCompositing Debug x64",
"type": "process",

View File

@@ -229,6 +229,50 @@ if(MSVC)
target_compile_options(LoopThroughWithOpenGLCompositing PRIVATE /W3)
endif()
# Standalone DeckLink render-cadence probe executable.
# It compiles the probe's own source from PROBE_APP_DIR together with the
# DeckLink video I/O, GL extension and playout scheduler sources from APP_DIR.
set(PROBE_APP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/apps/DeckLinkRenderCadenceProbe")
add_executable(DeckLinkRenderCadenceProbe
"${APP_DIR}/videoio/decklink/DeckLinkAPI_i.c"
"${APP_DIR}/videoio/decklink/DeckLinkDisplayMode.cpp"
"${APP_DIR}/videoio/decklink/DeckLinkDisplayMode.h"
"${APP_DIR}/videoio/decklink/DeckLinkFrameTransfer.cpp"
"${APP_DIR}/videoio/decklink/DeckLinkFrameTransfer.h"
"${APP_DIR}/videoio/decklink/DeckLinkSession.cpp"
"${APP_DIR}/videoio/decklink/DeckLinkSession.h"
"${APP_DIR}/videoio/decklink/DeckLinkVideoIOFormat.cpp"
"${APP_DIR}/videoio/decklink/DeckLinkVideoIOFormat.h"
"${APP_DIR}/gl/renderer/GLExtensions.cpp"
"${APP_DIR}/gl/renderer/GLExtensions.h"
"${APP_DIR}/videoio/VideoIOFormat.cpp"
"${APP_DIR}/videoio/VideoIOFormat.h"
"${APP_DIR}/videoio/VideoIOTypes.h"
"${APP_DIR}/videoio/VideoPlayoutPolicy.h"
"${APP_DIR}/videoio/VideoPlayoutScheduler.cpp"
"${APP_DIR}/videoio/VideoPlayoutScheduler.h"
"${PROBE_APP_DIR}/DeckLinkRenderCadenceProbe.cpp"
)
# The probe includes the shared headers by bare file name (e.g. "DeckLinkSession.h"),
# so each shared source directory is added to the include path.
target_include_directories(DeckLinkRenderCadenceProbe PRIVATE
"${APP_DIR}"
"${APP_DIR}/gl/renderer"
"${APP_DIR}/videoio"
"${APP_DIR}/videoio/decklink"
)
# opengl32 provides the WGL/OpenGL entry points; Ole32 provides the COM runtime
# (the probe calls CoInitialize/CoUninitialize).
target_link_libraries(DeckLinkRenderCadenceProbe PRIVATE
opengl32
Ole32
)
# Build with the wide-character Win32 API mappings enabled.
target_compile_definitions(DeckLinkRenderCadenceProbe PRIVATE
_UNICODE
UNICODE
)
if(MSVC)
# Warning level 3 on MSVC, matching the LoopThroughWithOpenGLCompositing target above.
target_compile_options(DeckLinkRenderCadenceProbe PRIVATE /W3)
endif()
add_executable(RuntimeJsonTests
"${APP_DIR}/runtime/support/RuntimeJson.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/tests/RuntimeJsonTests.cpp"

View File

@@ -0,0 +1,920 @@
#include "DeckLinkSession.h"
#include "GLExtensions.h"
#include "VideoIOFormat.h"
#include "VideoPlayoutPolicy.h"
#include <windows.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <deque>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
namespace
{
// Output raster the probe renders. main() exits with an error when the selected
// DeckLink output mode has a different frame size.
constexpr unsigned kDefaultWidth = 1920;
constexpr unsigned kDefaultHeight = 1080;
// Number of system-memory slots in the LatestFrameStore that main() creates.
constexpr std::size_t kSystemFrameSlots = 12;
// Number of pixel-pack buffers the render thread creates for readback.
constexpr std::size_t kPboDepth = 6;
// Completed-frame count main() waits for before starting the playout thread.
constexpr std::size_t kWarmupFrames = 4;
// Scheduled-slot count at which the playout thread stops scheduling further frames;
// main() also waits (bounded) for this count before starting scheduled playback.
constexpr std::size_t kDeckLinkTargetBufferedFrames = 4;
// Lifecycle state of one LatestFrameStore slot.
enum class ProbeSlotState
{
// Available to AcquireForRender().
Free,
// Handed out by AcquireForRender() and not yet published.
Rendering,
// Published by PublishCompleted(); waiting to be consumed or dropped.
Completed,
// Handed out by ConsumeCompleted(); stays in this state until ReleaseByBytes().
Scheduled
};
// Non-owning description of one LatestFrameStore slot, filled in by the store
// when a slot is acquired for rendering or consumed for scheduling.
struct ProbeFrame
{
// Start of the slot's pixel storage (owned by the store).
void* bytes = nullptr;
// Bytes per row as reported by VideoIORowBytes() for the store's width.
long rowBytes = 0;
unsigned width = 0;
unsigned height = 0;
VideoIOPixelFormat pixelFormat = VideoIOPixelFormat::Bgra8;
// Index of the slot inside the store.
std::size_t index = 0;
// Slot generation at the time the frame was filled in; PublishCompleted()
// rejects a frame whose generation no longer matches the slot.
uint64_t generation = 0;
// Frame counter value; the render thread sets it before publishing.
uint64_t frameIndex = 0;
};
// Counters and per-state slot counts reported by LatestFrameStore::Metrics().
struct ProbeMetrics
{
// Incremented by CountRenderedFrame().
uint64_t renderedFrames = 0;
// Incremented on every successful PublishCompleted().
uint64_t completedFrames = 0;
// Incremented on every successful ConsumeCompleted().
uint64_t scheduledFrames = 0;
// Completed slots recycled to make room for a new render.
uint64_t completedDrops = 0;
// AcquireForRender() calls that found no free or droppable slot.
uint64_t acquireMisses = 0;
// ConsumeCompleted() calls that found no completed slot.
uint64_t scheduleUnderruns = 0;
// Incremented by CountPboQueueMiss().
uint64_t pboQueueMisses = 0;
// Number of slots in each state at the moment Metrics() was called.
std::size_t freeCount = 0;
std::size_t renderingCount = 0;
std::size_t completedCount = 0;
std::size_t scheduledCount = 0;
};
class LatestFrameStore
{
public:
LatestFrameStore(unsigned width, unsigned height, std::size_t capacity) :
mWidth(width),
mHeight(height),
mRowBytes(VideoIORowBytes(VideoIOPixelFormat::Bgra8, width))
{
mSlots.resize(capacity);
const std::size_t byteCount = static_cast<std::size_t>(mRowBytes) * static_cast<std::size_t>(mHeight);
for (Slot& slot : mSlots)
{
slot.bytes.resize(byteCount);
slot.generation = 1;
}
}
bool AcquireForRender(ProbeFrame& frame)
{
std::lock_guard<std::mutex> lock(mMutex);
if (!AcquireFreeLocked(frame))
{
if (!DropOldestCompletedLocked() || !AcquireFreeLocked(frame))
{
++mMetrics.acquireMisses;
return false;
}
}
return true;
}
bool PublishCompleted(const ProbeFrame& frame)
{
std::lock_guard<std::mutex> lock(mMutex);
if (!IsValidLocked(frame))
return false;
Slot& slot = mSlots[frame.index];
if (slot.state != ProbeSlotState::Rendering)
return false;
slot.state = ProbeSlotState::Completed;
slot.frameIndex = frame.frameIndex;
mCompletedIndices.push_back(frame.index);
++mMetrics.completedFrames;
mCondition.notify_all();
return true;
}
bool ConsumeCompleted(ProbeFrame& frame)
{
std::lock_guard<std::mutex> lock(mMutex);
while (!mCompletedIndices.empty())
{
const std::size_t index = mCompletedIndices.front();
mCompletedIndices.pop_front();
if (index >= mSlots.size() || mSlots[index].state != ProbeSlotState::Completed)
continue;
mSlots[index].state = ProbeSlotState::Scheduled;
FillFrameLocked(index, frame);
++mMetrics.scheduledFrames;
return true;
}
++mMetrics.scheduleUnderruns;
return false;
}
bool ReleaseByBytes(void* bytes)
{
if (bytes == nullptr)
return false;
std::lock_guard<std::mutex> lock(mMutex);
for (std::size_t index = 0; index < mSlots.size(); ++index)
{
if (mSlots[index].bytes.data() != bytes)
continue;
mSlots[index].state = ProbeSlotState::Free;
++mSlots[index].generation;
RemoveCompletedIndexLocked(index);
mCondition.notify_all();
return true;
}
return false;
}
bool WaitForCompletedDepth(std::size_t targetDepth, std::chrono::milliseconds timeout)
{
std::unique_lock<std::mutex> lock(mMutex);
return mCondition.wait_for(lock, timeout, [&]() {
return CompletedCountLocked() >= targetDepth;
});
}
ProbeMetrics Metrics() const
{
std::lock_guard<std::mutex> lock(mMutex);
ProbeMetrics metrics = mMetrics;
for (const Slot& slot : mSlots)
{
switch (slot.state)
{
case ProbeSlotState::Free:
++metrics.freeCount;
break;
case ProbeSlotState::Rendering:
++metrics.renderingCount;
break;
case ProbeSlotState::Completed:
++metrics.completedCount;
break;
case ProbeSlotState::Scheduled:
++metrics.scheduledCount;
break;
}
}
return metrics;
}
void CountRenderedFrame()
{
std::lock_guard<std::mutex> lock(mMutex);
++mMetrics.renderedFrames;
}
void CountPboQueueMiss()
{
std::lock_guard<std::mutex> lock(mMutex);
++mMetrics.pboQueueMisses;
}
private:
struct Slot
{
std::vector<unsigned char> bytes;
ProbeSlotState state = ProbeSlotState::Free;
uint64_t generation = 1;
uint64_t frameIndex = 0;
};
bool AcquireFreeLocked(ProbeFrame& frame)
{
for (std::size_t index = 0; index < mSlots.size(); ++index)
{
if (mSlots[index].state != ProbeSlotState::Free)
continue;
mSlots[index].state = ProbeSlotState::Rendering;
++mSlots[index].generation;
FillFrameLocked(index, frame);
return true;
}
return false;
}
bool DropOldestCompletedLocked()
{
while (!mCompletedIndices.empty())
{
const std::size_t index = mCompletedIndices.front();
mCompletedIndices.pop_front();
if (index >= mSlots.size() || mSlots[index].state != ProbeSlotState::Completed)
continue;
mSlots[index].state = ProbeSlotState::Free;
++mSlots[index].generation;
++mMetrics.completedDrops;
return true;
}
return false;
}
void FillFrameLocked(std::size_t index, ProbeFrame& frame) const
{
const Slot& slot = mSlots[index];
frame.bytes = const_cast<unsigned char*>(slot.bytes.data());
frame.rowBytes = static_cast<long>(mRowBytes);
frame.width = mWidth;
frame.height = mHeight;
frame.pixelFormat = VideoIOPixelFormat::Bgra8;
frame.index = index;
frame.generation = slot.generation;
frame.frameIndex = slot.frameIndex;
}
bool IsValidLocked(const ProbeFrame& frame) const
{
return frame.index < mSlots.size() && mSlots[frame.index].generation == frame.generation;
}
void RemoveCompletedIndexLocked(std::size_t index)
{
mCompletedIndices.erase(std::remove(mCompletedIndices.begin(), mCompletedIndices.end(), index), mCompletedIndices.end());
}
std::size_t CompletedCountLocked() const
{
std::size_t count = 0;
for (const Slot& slot : mSlots)
{
if (slot.state == ProbeSlotState::Completed)
++count;
}
return count;
}
unsigned mWidth = 0;
unsigned mHeight = 0;
unsigned mRowBytes = 0;
std::vector<Slot> mSlots;
std::deque<std::size_t> mCompletedIndices;
mutable std::mutex mMutex;
std::condition_variable mCondition;
ProbeMetrics mMetrics;
};
// Window procedure for the hidden probe window; every message gets default handling.
LRESULT CALLBACK ProbeWindowProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
{
    // The window class is registered with RegisterClassA and the window is created
    // with CreateWindowA, so messages must go to the ANSI default procedure.
    // The generic DefWindowProc macro would resolve to DefWindowProcW because the
    // target is built with UNICODE defined.
    return DefWindowProcA(hwnd, message, wParam, lParam);
}
// Owns a never-shown Win32 window, its device context and a WGL rendering
// context created on that device context. The destructor releases them.
// Copying is disabled because a copy would release the same handles a second time.
class HiddenOpenGLContext
{
public:
    HiddenOpenGLContext() = default;
    HiddenOpenGLContext(const HiddenOpenGLContext&) = delete;
    HiddenOpenGLContext& operator=(const HiddenOpenGLContext&) = delete;

    ~HiddenOpenGLContext()
    {
        Destroy();
    }

    // Creates the window, picks a 32-bit RGBA double-buffered pixel format and
    // creates the GL context. On failure returns false with `error` set; any
    // handles created so far are released by Destroy() or the destructor.
    bool Create(unsigned width, unsigned height, std::string& error)
    {
        // Release handles from an earlier Create() so they are not leaked.
        if (mWindow || mDc || mGlrc)
            Destroy();

        mInstance = GetModuleHandle(nullptr);
        WNDCLASSA wc = {};
        wc.style = CS_OWNDC;
        wc.lpfnWndProc = ProbeWindowProc;
        wc.hInstance = mInstance;
        wc.lpszClassName = "DeckLinkRenderCadenceProbeWindow";
        // Result deliberately ignored: CreateWindowA below reports the failure if
        // the class is unusable.
        RegisterClassA(&wc);
        mWindow = CreateWindowA(
            wc.lpszClassName,
            "DeckLink Render Cadence Probe",
            WS_OVERLAPPEDWINDOW,
            CW_USEDEFAULT,
            CW_USEDEFAULT,
            static_cast<int>(width),
            static_cast<int>(height),
            nullptr,
            nullptr,
            mInstance,
            nullptr);
        if (!mWindow)
        {
            error = "CreateWindowA failed.";
            return false;
        }
        mDc = GetDC(mWindow);
        if (!mDc)
        {
            error = "GetDC failed.";
            return false;
        }
        PIXELFORMATDESCRIPTOR pfd = {};
        pfd.nSize = sizeof(pfd);
        pfd.nVersion = 1;
        pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
        pfd.iPixelType = PFD_TYPE_RGBA;
        pfd.cColorBits = 32;
        pfd.cDepthBits = 0;
        pfd.iLayerType = PFD_MAIN_PLANE;
        const int pixelFormat = ChoosePixelFormat(mDc, &pfd);
        if (pixelFormat == 0 || !SetPixelFormat(mDc, pixelFormat, &pfd))
        {
            error = "Could not choose/set a pixel format.";
            return false;
        }
        mGlrc = wglCreateContext(mDc);
        if (!mGlrc)
        {
            error = "wglCreateContext failed.";
            return false;
        }
        return true;
    }

    // Makes the context current on the calling thread; false if it was never created.
    bool MakeCurrent()
    {
        return mDc && mGlrc && wglMakeCurrent(mDc, mGlrc);
    }

    // Unbinds whatever GL context is current on the calling thread.
    void ClearCurrent()
    {
        wglMakeCurrent(nullptr, nullptr);
    }

    // Releases the GL context, the device context and the window, in that order.
    // Safe to call repeatedly.
    void Destroy()
    {
        ClearCurrent();
        if (mGlrc)
        {
            wglDeleteContext(mGlrc);
            mGlrc = nullptr;
        }
        if (mWindow && mDc)
        {
            ReleaseDC(mWindow, mDc);
            mDc = nullptr;
        }
        if (mWindow)
        {
            DestroyWindow(mWindow);
            mWindow = nullptr;
        }
    }

private:
    HINSTANCE mInstance = nullptr;
    HWND mWindow = nullptr;
    HDC mDc = nullptr;
    HGLRC mGlrc = nullptr;
};
// Runs the probe's render thread. The thread owns a hidden GL context, renders
// a moving test pattern into an offscreen framebuffer once per frame duration,
// reads each frame back through a ring of pixel-pack buffers and copies finished
// readbacks into the LatestFrameStore.
class RenderCadenceProbe
{
public:
    // `frameDurationMs` is the render interval; non-positive values fall back to 16 ms.
    RenderCadenceProbe(LatestFrameStore& frameStore, unsigned width, unsigned height, double frameDurationMs) :
        mFrameStore(frameStore),
        mWidth(width),
        mHeight(height),
        mFrameDuration(std::chrono::duration_cast<Clock::duration>(std::chrono::duration<double, std::milli>(frameDurationMs)))
    {
        if (mFrameDuration <= Clock::duration::zero())
            mFrameDuration = std::chrono::milliseconds(16);
    }

    // Joins the render thread. Destroying a joinable std::thread would call
    // std::terminate, so the thread must never outlive this object.
    ~RenderCadenceProbe()
    {
        Stop();
    }

    // Starts the render thread and waits up to 3 seconds for it to finish GL setup.
    // Returns false with `error` set when the thread is already running, setup
    // fails, or the wait times out. In the last two cases the thread is stopped
    // and joined before returning.
    bool Start(std::string& error)
    {
        if (mThread.joinable())
        {
            error = "Render thread is already running.";
            return false;
        }
        {
            // Clear the outcome of any previous run so the wait below sees fresh state.
            std::lock_guard<std::mutex> lock(mStartupMutex);
            mStarted = false;
            mStartupError.clear();
        }
        mStopping = false;
        mThread = std::thread([this]() { ThreadMain(); });

        bool started = false;
        {
            std::unique_lock<std::mutex> lock(mStartupMutex);
            const bool signalled = mStartupCondition.wait_for(lock, std::chrono::seconds(3), [this]() { return mStarted || !mStartupError.empty(); });
            if (!signalled)
                error = "Timed out starting render thread.";
            else if (!mStartupError.empty())
                error = mStartupError;
            else
                started = true;
        }
        // Join outside the startup lock: a thread that is still starting needs
        // that lock to report its result.
        if (!started)
            Stop();
        return started;
    }

    // Asks the render thread to exit and joins it. Safe to call more than once.
    void Stop()
    {
        mStopping = true;
        if (mThread.joinable())
            mThread.join();
    }

private:
    // One entry of the readback ring.
    struct PboSlot
    {
        GLuint pbo = 0;
        GLsync fence = nullptr;
        // True between QueueReadback() and the matching ConsumeCompletedPbos().
        bool inFlight = false;
        uint64_t frameIndex = 0;
    };

    using Clock = std::chrono::steady_clock;

    // Render thread body: GL setup, paced render/readback loop, teardown.
    void ThreadMain()
    {
        std::string error;
        HiddenOpenGLContext context;
        if (!context.Create(mWidth, mHeight, error) || !context.MakeCurrent())
        {
            SignalStartupFailure(error.empty() ? "OpenGL context creation failed." : error);
            return;
        }
        if (!ResolveGLExtensions())
        {
            SignalStartupFailure("OpenGL extension resolution failed.");
            return;
        }
        if (!CreateRenderTargets())
        {
            // Delete the partially created framebuffer/texture while the context is still current.
            DestroyRenderTargets();
            SignalStartupFailure("OpenGL render target creation failed.");
            return;
        }
        CreatePbos();
        SignalStarted();

        auto nextRenderTime = Clock::now();
        while (!mStopping)
        {
            ConsumeCompletedPbos();
            const auto now = Clock::now();
            if (now < nextRenderTime)
            {
                // Sleep at most 1 ms at a time so finished readbacks keep being drained.
                std::this_thread::sleep_for((std::min)(std::chrono::milliseconds(1), std::chrono::duration_cast<std::chrono::milliseconds>(nextRenderTime - now)));
                continue;
            }
            RenderPattern(mFrameIndex);
            if (!QueueReadback(mFrameIndex))
                mFrameStore.CountPboQueueMiss();
            mFrameStore.CountRenderedFrame();
            ++mFrameIndex;
            nextRenderTime += mFrameDuration;
            // More than four frames behind schedule: restart the cadence from now
            // instead of rendering a burst to catch up.
            if (Clock::now() - nextRenderTime > mFrameDuration * 4)
                nextRenderTime = Clock::now() + mFrameDuration;
        }
        FlushPbos();
        DestroyPbos();
        DestroyRenderTargets();
        context.ClearCurrent();
    }

    // Creates the offscreen framebuffer with an RGBA8 colour texture.
    // Returns whether the framebuffer is complete.
    bool CreateRenderTargets()
    {
        glGenFramebuffers(1, &mFramebuffer);
        glBindFramebuffer(GL_FRAMEBUFFER, mFramebuffer);
        glGenTextures(1, &mTexture);
        glBindTexture(GL_TEXTURE_2D, mTexture);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, static_cast<GLsizei>(mWidth), static_cast<GLsizei>(mHeight), 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, nullptr);
        glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, mTexture, 0);
        const bool complete = glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE;
        glBindTexture(GL_TEXTURE_2D, 0);
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
        return complete;
    }

    void DestroyRenderTargets()
    {
        if (mFramebuffer != 0)
            glDeleteFramebuffers(1, &mFramebuffer);
        if (mTexture != 0)
            glDeleteTextures(1, &mTexture);
        mFramebuffer = 0;
        mTexture = 0;
    }

    // Allocates kPboDepth pixel-pack buffers, each sized for one BGRA8 frame.
    void CreatePbos()
    {
        mPbos.resize(kPboDepth);
        const std::size_t byteCount = static_cast<std::size_t>(VideoIORowBytes(VideoIOPixelFormat::Bgra8, mWidth)) * mHeight;
        for (PboSlot& slot : mPbos)
        {
            glGenBuffers(1, &slot.pbo);
            glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pbo);
            glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(byteCount), nullptr, GL_STREAM_READ);
        }
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
    }

    void DestroyPbos()
    {
        for (PboSlot& slot : mPbos)
        {
            if (slot.fence)
                glDeleteSync(slot.fence);
            if (slot.pbo != 0)
                glDeleteBuffers(1, &slot.pbo);
            slot = {};
        }
        mPbos.clear();
    }

    // Best-effort drain at shutdown: polls the ring a bounded number of times
    // without blocking on the GPU.
    void FlushPbos()
    {
        for (std::size_t i = 0; i < mPbos.size() * 2; ++i)
            ConsumeCompletedPbos();
    }

    // Clears the framebuffer to a slowly changing colour and draws a moving box
    // with a scissored clear.
    void RenderPattern(uint64_t frameIndex)
    {
        const float t = static_cast<float>(frameIndex) / 60.0f;
        const float red = 0.1f + 0.4f * (0.5f + 0.5f * std::sin(t));
        const float green = 0.1f + 0.4f * (0.5f + 0.5f * std::sin(t * 0.73f + 1.0f));
        const float blue = 0.15f + 0.3f * (0.5f + 0.5f * std::sin(t * 0.41f + 2.0f));
        glBindFramebuffer(GL_FRAMEBUFFER, mFramebuffer);
        glViewport(0, 0, static_cast<GLsizei>(mWidth), static_cast<GLsizei>(mHeight));
        glDisable(GL_SCISSOR_TEST);
        glClearColor(red, green, blue, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT);
        const int boxWidth = static_cast<int>(mWidth / 6);
        const int boxHeight = static_cast<int>(mHeight / 5);
        const float phase = 0.5f + 0.5f * std::sin(t * 1.7f);
        const int x = static_cast<int>(phase * static_cast<float>(mWidth - boxWidth));
        const int y = static_cast<int>((0.5f + 0.5f * std::sin(t * 1.1f + 0.8f)) * static_cast<float>(mHeight - boxHeight));
        glEnable(GL_SCISSOR_TEST);
        glScissor(x, y, boxWidth, boxHeight);
        glClearColor(1.0f - red, 0.85f, 0.15f + blue, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT);
        glDisable(GL_SCISSOR_TEST);
    }

    // Starts an asynchronous readback of the framebuffer into the next ring slot.
    // Returns false when the ring is empty, the next slot is still in flight, or
    // no fence could be created.
    bool QueueReadback(uint64_t frameIndex)
    {
        if (mPbos.empty())
            return false;
        PboSlot& slot = mPbos[mWriteIndex];
        if (slot.inFlight)
            return false;
        const std::size_t byteCount = static_cast<std::size_t>(VideoIORowBytes(VideoIOPixelFormat::Bgra8, mWidth)) * mHeight;
        glBindFramebuffer(GL_READ_FRAMEBUFFER, mFramebuffer);
        glPixelStorei(GL_PACK_ALIGNMENT, 4);
        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pbo);
        // Re-specify the buffer store before each readback.
        glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(byteCount), nullptr, GL_STREAM_READ);
        glReadPixels(0, 0, static_cast<GLsizei>(mWidth), static_cast<GLsizei>(mHeight), GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, nullptr);
        slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
        slot.inFlight = slot.fence != nullptr;
        slot.frameIndex = frameIndex;
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        mWriteIndex = (mWriteIndex + 1) % mPbos.size();
        return slot.inFlight;
    }

    // Copies every readback whose fence has signalled into the frame store, in
    // ring order. Stops at the first readback that is not finished yet.
    void ConsumeCompletedPbos()
    {
        for (std::size_t checked = 0; checked < mPbos.size(); ++checked)
        {
            PboSlot& slot = mPbos[mReadIndex];
            if (!slot.inFlight || slot.fence == nullptr)
            {
                mReadIndex = (mReadIndex + 1) % mPbos.size();
                continue;
            }
            // Zero timeout: poll the fence without blocking the render cadence.
            const GLenum waitResult = glClientWaitSync(slot.fence, 0, 0);
            if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED)
                return;
            ProbeFrame frame;
            if (mFrameStore.AcquireForRender(frame))
            {
                glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pbo);
                void* mapped = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
                if (mapped)
                {
                    const std::size_t byteCount = static_cast<std::size_t>(frame.rowBytes) * frame.height;
                    std::memcpy(frame.bytes, mapped, byteCount);
                    glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
                    frame.frameIndex = slot.frameIndex;
                    mFrameStore.PublishCompleted(frame);
                }
                else
                {
                    // Mapping failed: return the slot, otherwise it would stay in
                    // the Rendering state and be lost to the pool.
                    mFrameStore.ReleaseByBytes(frame.bytes);
                }
                glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
            }
            glDeleteSync(slot.fence);
            slot.fence = nullptr;
            slot.inFlight = false;
            mReadIndex = (mReadIndex + 1) % mPbos.size();
        }
    }

    void SignalStarted()
    {
        std::lock_guard<std::mutex> lock(mStartupMutex);
        mStarted = true;
        mStartupCondition.notify_all();
    }

    void SignalStartupFailure(const std::string& error)
    {
        std::lock_guard<std::mutex> lock(mStartupMutex);
        mStartupError = error;
        mStartupCondition.notify_all();
    }

    LatestFrameStore& mFrameStore;
    unsigned mWidth = 0;
    unsigned mHeight = 0;
    Clock::duration mFrameDuration;
    std::thread mThread;
    std::atomic<bool> mStopping{ false };
    // Guards mStarted and mStartupError during the startup handshake.
    std::mutex mStartupMutex;
    std::condition_variable mStartupCondition;
    bool mStarted = false;
    std::string mStartupError;
    GLuint mFramebuffer = 0;
    GLuint mTexture = 0;
    std::vector<PboSlot> mPbos;
    std::size_t mWriteIndex = 0;
    std::size_t mReadIndex = 0;
    uint64_t mFrameIndex = 0;
};
// Runs the playout thread: takes completed frames from the LatestFrameStore and
// schedules them on the DeckLink session while fewer than
// kDeckLinkTargetBufferedFrames slots are in the Scheduled state.
class DeckLinkProbePlayout
{
public:
    DeckLinkProbePlayout(DeckLinkSession& session, LatestFrameStore& frameStore) :
        mSession(session),
        mFrameStore(frameStore)
    {
    }

    // Joins the playout thread. Destroying a joinable std::thread would call
    // std::terminate, so the thread must never outlive this object.
    ~DeckLinkProbePlayout()
    {
        Stop();
    }

    // Starts the playout thread. Returns false if it is already running:
    // assigning over a joinable std::thread would terminate the process.
    bool Start()
    {
        if (mThread.joinable())
            return false;
        mStopping = false;
        mThread = std::thread([this]() { ThreadMain(); });
        return true;
    }

    // Asks the playout thread to exit and joins it. Safe to call more than once.
    void Stop()
    {
        mStopping = true;
        if (mThread.joinable())
            mThread.join();
    }

    // Playout thread body; polls at 1 ms intervals when there is nothing to do.
    void ThreadMain()
    {
        while (!mStopping)
        {
            const ProbeMetrics metrics = mFrameStore.Metrics();
            if (metrics.scheduledCount >= kDeckLinkTargetBufferedFrames)
            {
                // Enough frames are already scheduled; wait for completions to free some.
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
                continue;
            }
            ProbeFrame frame;
            if (!mFrameStore.ConsumeCompleted(frame))
            {
                // Nothing rendered yet.
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
                continue;
            }
            VideoIOOutputFrame outputFrame;
            outputFrame.bytes = frame.bytes;
            outputFrame.nativeBuffer = frame.bytes;
            outputFrame.rowBytes = frame.rowBytes;
            outputFrame.width = frame.width;
            outputFrame.height = frame.height;
            outputFrame.pixelFormat = frame.pixelFormat;
            if (!mSession.ScheduleOutputFrame(outputFrame))
            {
                // Scheduling failed: return the slot to the pool so it is not
                // left in the Scheduled state.
                mFrameStore.ReleaseByBytes(frame.bytes);
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        }
    }

private:
    DeckLinkSession& mSession;
    LatestFrameStore& mFrameStore;
    std::thread mThread;
    std::atomic<bool> mStopping{ false };
};
// Maps a DeckLink completion result to its short telemetry label.
// Any value without a dedicated label yields "unknown".
std::string CompletionResultToString(VideoIOCompletionResult result)
{
    const char* label = "unknown";
    if (result == VideoIOCompletionResult::Completed)
        label = "completed";
    else if (result == VideoIOCompletionResult::DisplayedLate)
        label = "late";
    else if (result == VideoIOCompletionResult::Dropped)
        label = "dropped";
    else if (result == VideoIOCompletionResult::Flushed)
        label = "flushed";
    return label;
}
// Writes the probe's banner and stop instructions to standard output.
void PrintUsage()
{
    static const char kBanner[] =
        "DeckLinkRenderCadenceProbe\n"
        " Renders a simple OpenGL BGRA8 motion pattern on one GL thread,\n"
        " copies completed PBO readbacks into latest-N system memory slots,\n"
        " warms up rendered frames, then feeds DeckLink scheduled playback.\n\n"
        "Press Enter to stop.\n";
    std::cout << kBanner;
}
class ComInitGuard
{
public:
~ComInitGuard()
{
if (mInitialized)
CoUninitialize();
}
bool Initialize()
{
const HRESULT result = CoInitialize(nullptr);
mInitialized = SUCCEEDED(result);
mResult = result;
return mInitialized;
}
HRESULT Result() const { return mResult; }
private:
bool mInitialized = false;
HRESULT mResult = S_OK;
};
}
// Probe entry point.
// Configures DeckLink output, starts the render thread, waits for warmup frames,
// starts the playout thread and scheduled playback, then prints one telemetry
// line per second until a line is read from stdin.
// Returns 0 after a normal shutdown and 1 when any setup step fails.
int main()
{
PrintUsage();
ComInitGuard com;
if (!com.Initialize())
{
std::cerr << "COM initialization failed: 0x" << std::hex << com.Result() << std::dec << "\n";
return 1;
}
// Declared before the session so the completion callback below can refer to it.
LatestFrameStore frameStore(kDefaultWidth, kDefaultHeight, kSystemFrameSlots);
DeckLinkSession deckLink;
// Completion counters updated from the output callback and read by the metrics thread.
std::atomic<uint64_t> completions{ 0 };
std::atomic<uint64_t> late{ 0 };
std::atomic<uint64_t> dropped{ 0 };
VideoFormatSelection formats;
std::string error;
if (!deckLink.DiscoverDevicesAndModes(formats, error))
{
std::cerr << "DeckLink discovery failed: " << error << "\n";
return 1;
}
if (!deckLink.SelectPreferredFormats(formats, false, error))
{
std::cerr << "DeckLink format selection failed: " << error << "\n";
return 1;
}
if (!deckLink.ConfigureOutput(
// Completion callback: return the frame's slot to the store and tally the result.
[&](const VideoIOCompletion& completion) {
frameStore.ReleaseByBytes(completion.outputFrameBuffer);
++completions;
if (completion.result == VideoIOCompletionResult::DisplayedLate)
++late;
else if (completion.result == VideoIOCompletionResult::Dropped)
++dropped;
},
formats.output,
false,
error))
{
std::cerr << "DeckLink output configuration failed: " << error << "\n";
return 1;
}
if (!deckLink.PrepareOutputSchedule())
{
std::cerr << "DeckLink schedule preparation failed.\n";
return 1;
}
// The frame store and render targets are sized for kDefaultWidth x kDefaultHeight,
// so any other output frame size is rejected.
const VideoIOState& state = deckLink.State();
if (state.outputFrameSize.width != kDefaultWidth || state.outputFrameSize.height != kDefaultHeight)
{
std::cerr << "This probe currently expects 1920x1080 output. Selected mode is "
<< state.outputFrameSize.width << "x" << state.outputFrameSize.height << ".\n";
return 1;
}
// Render at the frame budget reported by the session state.
RenderCadenceProbe renderer(frameStore, state.outputFrameSize.width, state.outputFrameSize.height, state.frameBudgetMilliseconds);
if (!renderer.Start(error))
{
std::cerr << "Render thread start failed: " << error << "\n";
return 1;
}
// Warmup: wait until kWarmupFrames completed frames exist before scheduling anything.
std::cout << "Warming up " << kWarmupFrames << " rendered frames at cadence...\n";
if (!frameStore.WaitForCompletedDepth(kWarmupFrames, std::chrono::seconds(3)))
{
std::cerr << "Timed out waiting for rendered warmup frames.\n";
renderer.Stop();
return 1;
}
DeckLinkProbePlayout playout(deckLink, frameStore);
playout.Start();
// Preroll: give the playout thread up to 3 seconds to schedule the target number
// of frames. Playback is started afterwards whether or not the target was reached.
const auto prerollDeadline = std::chrono::steady_clock::now() + std::chrono::seconds(3);
while (std::chrono::steady_clock::now() < prerollDeadline)
{
if (frameStore.Metrics().scheduledCount >= kDeckLinkTargetBufferedFrames)
break;
std::this_thread::sleep_for(std::chrono::milliseconds(2));
}
if (!deckLink.StartScheduledPlayback())
{
std::cerr << "DeckLink scheduled playback failed to start.\n";
playout.Stop();
renderer.Stop();
return 1;
}
// Telemetry thread: once per second prints rates computed from the counter
// deltas since the previous sample, plus the current slot and completion counts.
std::atomic<bool> metricsStopping{ false };
std::thread metricsThread([&]() {
uint64_t lastRendered = 0;
uint64_t lastScheduled = 0;
auto lastTime = std::chrono::steady_clock::now();
while (!metricsStopping)
{
std::this_thread::sleep_for(std::chrono::seconds(1));
const auto now = std::chrono::steady_clock::now();
const double seconds = std::chrono::duration_cast<std::chrono::duration<double>>(now - lastTime).count();
const ProbeMetrics metrics = frameStore.Metrics();
const double renderFps = seconds > 0.0 ? static_cast<double>(metrics.renderedFrames - lastRendered) / seconds : 0.0;
const double scheduleFps = seconds > 0.0 ? static_cast<double>(metrics.scheduledFrames - lastScheduled) / seconds : 0.0;
lastRendered = metrics.renderedFrames;
lastScheduled = metrics.scheduledFrames;
lastTime = now;
std::cout << std::fixed << std::setprecision(1)
<< "renderFps=" << renderFps
<< " scheduleFps=" << scheduleFps
<< " free=" << metrics.freeCount
<< " completed=" << metrics.completedCount
<< " scheduled=" << metrics.scheduledCount
<< " drops=" << metrics.completedDrops
<< " pboMiss=" << metrics.pboQueueMisses
<< " completions=" << completions.load()
<< " late=" << late.load()
<< " dropped=" << dropped.load()
<< " decklinkBuffered=" << deckLink.State().actualDeckLinkBufferedFrames
<< "\n";
}
});
// Block until the user presses Enter (or stdin is closed).
std::string line;
std::getline(std::cin, line);
// Shutdown order: telemetry, playout thread, DeckLink session, render thread,
// then the DeckLink resources.
metricsStopping = true;
if (metricsThread.joinable())
metricsThread.join();
playout.Stop();
deckLink.Stop();
renderer.Stop();
deckLink.ReleaseResources();
return 0;
}

View File

@@ -0,0 +1,113 @@
# DeckLink Render Cadence Probe
This is a deliberately small architecture probe for the Phase 7.7 playout model.
It is not the main app and does not use the main runtime, shader stack, preview path, input upload path, or render engine.
## What It Tests
The probe validates the clean playout spine:
```text
single OpenGL render thread
owns its own hidden GL context
renders a simple moving BGRA8 pattern at output cadence
queues GPU readback through a PBO ring
copies completed readbacks into latest-N system-memory slots
system-memory frame store
owns free / rendering / completed / scheduled slots
drops old completed unscheduled frames when render cadence needs space
protects scheduled frames until DeckLink completion
DeckLink playout thread
consumes completed system-memory frames
keeps a small scheduled buffer filled
does not render
```
Startup warms up rendered frames before starting DeckLink scheduled playback.
## How To Build
```powershell
cmake --build --preset build-debug --target DeckLinkRenderCadenceProbe -- /m:1
```
The executable is:
```text
build\vs2022-x64-debug\Debug\DeckLinkRenderCadenceProbe.exe
```
## How To Run
Run it from a terminal so you can see the telemetry:
```powershell
build\vs2022-x64-debug\Debug\DeckLinkRenderCadenceProbe.exe
```
Press Enter to stop.
The first version assumes `1080p59.94` output and BGRA8 system-memory frames; it exits with an error if the selected output mode is not `1920x1080`.
## What To Watch
The probe prints one line per second:
- `renderFps`: cadence render throughput
- `scheduleFps`: DeckLink scheduling throughput
- `free`: free system-memory slots
- `completed`: rendered, unscheduled slots
- `scheduled`: slots currently owned by DeckLink
- `drops`: old completed unscheduled frames recycled by the latest-N cache
- `pboMiss`: PBO ring was full when trying to queue readback
- `completions`: total DeckLink frame completions received since start
- `late`: DeckLink displayed-late completions
- `dropped`: DeckLink dropped completions
- `decklinkBuffered`: actual DeckLink buffered-frame count when available
For a healthy architecture proof, expect:
- `renderFps` close to the selected output cadence
- `scheduleFps` close to the selected output cadence after warmup
- `scheduled` hovering near the target buffer depth
- `late` and `dropped` not increasing continuously
- visible motion that is smooth on the DeckLink output
## Interpretation
If this probe is smooth at 59.94/60, the broad architecture is viable and the main app's remaining stutters are likely caused by integration details such as input upload, shared render-thread work, preview/screenshot work, or runtime/render-state coupling.
If this probe is not smooth, the problem is lower level: DeckLink scheduling, OpenGL readback, Windows scheduling, or hardware/driver behavior.
## Initial Result
Date: 2026-05-12
User-visible result:
- output looked smooth
Representative telemetry:
```text
renderFps=59.9 scheduleFps=59.9 free=7 completed=1 scheduled=4 drops=0 pboMiss=0 completions=119 late=0 dropped=0 decklinkBuffered=4
renderFps=59.9 scheduleFps=59.9 free=7 completed=1 scheduled=4 drops=0 pboMiss=0 completions=179 late=0 dropped=0 decklinkBuffered=4
renderFps=59.8 scheduleFps=59.8 free=7 completed=1 scheduled=4 drops=0 pboMiss=0 completions=239 late=0 dropped=0 decklinkBuffered=4
renderFps=60.8 scheduleFps=59.8 free=7 completed=1 scheduled=4 drops=0 pboMiss=0 completions=299 late=0 dropped=0 decklinkBuffered=4
renderFps=59.9 scheduleFps=59.9 free=7 completed=1 scheduled=4 drops=0 pboMiss=0 completions=360 late=0 dropped=0 decklinkBuffered=4
renderFps=59.8 scheduleFps=60.8 free=8 completed=0 scheduled=4 drops=0 pboMiss=0 completions=420 late=0 dropped=0 decklinkBuffered=4
```
Reading of these results:
- the clean architecture can sustain the selected output cadence on the test machine
- BGRA8 PBO readback is viable when isolated from the main app's other render-thread work
- latest-N system-memory buffering stayed stable
- DeckLink actual buffered depth stayed at 4
- there were no late frames, dropped frames, completed-frame drops, or PBO misses in the sampled output
Implication:
The main app's remaining stutters are likely integration/ownership issues rather than a fundamental DeckLink/OpenGL/BGRA8 readback limit. The highest-value suspects are input upload before output render, shared render-thread queue contention, preview/screenshot work, and runtime/render-state work on the output path.

View File

@@ -110,19 +110,42 @@ bool VideoBackend::ConfigureOutput(const VideoFormat& outputVideoMode, bool exte
bool VideoBackend::Start()
{
ApplyLifecycleTransition(VideoBackendLifecycleState::Prerolling, "Video backend preroll starting.");
if (!mVideoIODevice->PrepareOutputSchedule())
{
ApplyLifecycleFailure(StatusMessage().empty() ? "Video backend output schedule preparation failed." : StatusMessage());
return false;
}
StartOutputCompletionWorker();
const bool started = mVideoIODevice->Start();
if (started)
{
StartOutputProducerWorker();
ApplyLifecycleTransition(VideoBackendLifecycleState::Running, "Video backend started.");
}
else
if (!WarmupOutputPreroll())
{
StopOutputProducerWorker();
StopOutputCompletionWorker();
ApplyLifecycleFailure(StatusMessage().empty() ? "Video backend start failed." : StatusMessage());
ApplyLifecycleFailure(StatusMessage().empty() ? "Video backend preroll warmup failed." : StatusMessage());
return false;
}
return started;
if (!mVideoIODevice->StartInputStreams())
{
StopOutputProducerWorker();
StopOutputCompletionWorker();
ApplyLifecycleFailure(StatusMessage().empty() ? "Video backend input stream start failed." : StatusMessage());
return false;
}
if (!mVideoIODevice->StartScheduledPlayback())
{
StopOutputProducerWorker();
mVideoIODevice->Stop();
StopOutputCompletionWorker();
ApplyLifecycleFailure(StatusMessage().empty() ? "Video backend scheduled playback start failed." : StatusMessage());
return false;
}
ApplyLifecycleTransition(VideoBackendLifecycleState::Running, "Video backend started.");
return true;
}
bool VideoBackend::Stop()
@@ -393,6 +416,39 @@ void VideoBackend::NotifyOutputProducer()
mOutputProducerCondition.notify_one();
}
bool VideoBackend::WarmupOutputPreroll()
{
const VideoPlayoutPolicy policy = NormalizeVideoPlayoutPolicy(mPlayoutPolicy);
const std::size_t targetPrerollFrames = static_cast<std::size_t>(policy.targetPrerollFrames);
if (targetPrerollFrames == 0)
return true;
const double frameBudgetMilliseconds = State().frameBudgetMilliseconds > 0.0 ? State().frameBudgetMilliseconds : 16.0;
const auto estimatedCadenceTime = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::duration<double, std::milli>(frameBudgetMilliseconds * static_cast<double>(targetPrerollFrames + 2)));
const auto timeout = (std::max)(std::chrono::milliseconds(1000), estimatedCadenceTime + std::chrono::milliseconds(500));
const auto deadline = std::chrono::steady_clock::now() + timeout;
while (std::chrono::steady_clock::now() < deadline)
{
ScheduleReadyOutputFramesToTarget();
const SystemOutputFramePoolMetrics metrics = mSystemOutputFramePool.GetMetrics();
RecordSystemMemoryPlayoutStats();
if (metrics.scheduledCount >= targetPrerollFrames)
return true;
NotifyOutputProducer();
const auto waitDuration = (std::min)(OutputProducerWakeInterval(), std::chrono::milliseconds(5));
std::unique_lock<std::mutex> lock(mOutputProducerMutex);
mOutputProducerCondition.wait_for(lock, waitDuration);
if (mOutputProducerWorkerStopping)
return false;
}
SetStatusMessage("Timed out warming up DeckLink preroll from rendered system-memory frames.");
return false;
}
void VideoBackend::OutputCompletionWorkerMain()
{
for (;;)

View File

@@ -77,6 +77,7 @@ private:
void StopOutputProducerWorker();
void OutputProducerWorkerMain();
void NotifyOutputProducer();
bool WarmupOutputPreroll();
std::chrono::milliseconds OutputProducerWakeInterval() const;
void ProcessOutputFrameCompletion(const VideoIOCompletion& completion);
std::size_t ProduceReadyOutputFrames(const VideoIOCompletion& completion, std::size_t maxFrames);

View File

@@ -116,6 +116,9 @@ public:
virtual bool SelectPreferredFormats(const VideoFormatSelection& videoModes, bool outputAlphaRequired, std::string& error) = 0;
virtual bool ConfigureInput(InputFrameCallback callback, const VideoFormat& inputVideoMode, std::string& error) = 0;
virtual bool ConfigureOutput(OutputFrameCallback callback, const VideoFormat& outputVideoMode, bool externalKeyingEnabled, std::string& error) = 0;
// Phased startup hooks. Each returns false on failure; callers treat a false
// return as a failed startup step.

// Prepares the output schedule before any frames are scheduled for playout.
// Does not start playback.
virtual bool PrepareOutputSchedule() = 0;
// Starts input capture. Implementations may succeed without doing anything
// when no input is configured (the DeckLink implementation does).
virtual bool StartInputStreams() = 0;
// Starts scheduled playback of the output frames that have been scheduled.
virtual bool StartScheduledPlayback() = 0;
virtual bool Start() = 0;
virtual bool Stop() = 0;
virtual const VideoIOState& State() const = 0;

View File

@@ -660,9 +660,45 @@ bool DeckLinkSession::ScheduleOutputFrame(const VideoIOOutputFrame& frame)
return scheduled;
}
bool DeckLinkSession::Start()
// Resets the playout scheduler and refreshes the buffered-frame count, then
// reports whether an output device is available. The reset and refresh run
// even when there is no output; only the return value reflects its absence.
bool DeckLinkSession::PrepareOutputSchedule()
{
    mScheduler.Reset();
    RefreshBufferedVideoFrameCount();

    const bool hasOutputDevice = (output != nullptr);
    return hasOutputDevice;
}
// Starts the DeckLink input streams when an input is present.
// Returns true when there is no input (nothing to start) or when the streams
// started with S_OK. Otherwise shows an error dialog and returns false.
bool DeckLinkSession::StartInputStreams()
{
    if (input)
    {
        const auto startResult = input->StartStreams();
        if (startResult != S_OK)
        {
            MessageBoxA(nullptr, "Could not start the DeckLink input stream.", "DeckLink start failed", MB_OK | MB_ICONERROR);
            return false;
        }
    }

    return true;
}
// Starts DeckLink scheduled playback at time 0, using the scheduler's time
// scale and a playback speed of 1.0, then refreshes the buffered-frame count.
// Shows an error dialog and returns false when no output device is available
// or when the device does not return S_OK.
bool DeckLinkSession::StartScheduledPlayback()
{
    const char* failureText = nullptr;
    if (!output)
        failureText = "Cannot start playout because no DeckLink output device is available.";
    else if (output->StartScheduledPlayback(0, mScheduler.TimeScale(), 1.0) != S_OK)
        failureText = "Could not start DeckLink scheduled playback.";

    if (failureText != nullptr)
    {
        MessageBoxA(nullptr, failureText, "DeckLink start failed", MB_OK | MB_ICONERROR);
        return false;
    }

    RefreshBufferedVideoFrameCount();
    return true;
}
bool DeckLinkSession::Start()
{
if (!output)
{
MessageBoxA(NULL, "Cannot start playout because no DeckLink output device is available.", "DeckLink start failed", MB_OK | MB_ICONERROR);
@@ -676,6 +712,9 @@ bool DeckLinkSession::Start()
const VideoPlayoutPolicy policy = NormalizeVideoPlayoutPolicy(mPlayoutPolicy);
mPlayoutPolicy = policy;
if (!PrepareOutputSchedule())
return false;
for (unsigned i = 0; i < policy.targetPrerollFrames; i++)
{
CComPtr<IDeckLinkMutableVideoFrame> outputVideoFrame;
@@ -691,21 +730,7 @@ bool DeckLinkSession::Start()
}
}
if (input)
{
if (input->StartStreams() != S_OK)
{
MessageBoxA(NULL, "Could not start the DeckLink input stream.", "DeckLink start failed", MB_OK | MB_ICONERROR);
return false;
}
}
if (output->StartScheduledPlayback(0, mScheduler.TimeScale(), 1.0) != S_OK)
{
MessageBoxA(NULL, "Could not start DeckLink scheduled playback.", "DeckLink start failed", MB_OK | MB_ICONERROR);
return false;
}
return true;
return StartInputStreams() && StartScheduledPlayback();
}
bool DeckLinkSession::Stop()

View File

@@ -28,6 +28,9 @@ public:
bool SelectPreferredFormats(const VideoFormatSelection& videoModes, bool outputAlphaRequired, std::string& error) override;
bool ConfigureInput(InputFrameCallback callback, const VideoFormat& inputVideoMode, std::string& error) override;
bool ConfigureOutput(OutputFrameCallback callback, const VideoFormat& outputVideoMode, bool externalKeyingEnabled, std::string& error) override;
bool PrepareOutputSchedule() override;
bool StartInputStreams() override;
bool StartScheduledPlayback() override;
bool Start() override;
bool Stop() override;

View File

@@ -0,0 +1,377 @@
# DeckLink / OpenGL Lessons Learned
This document summarizes the practical lessons from the Phase 3-7.7 refactor work, especially the DeckLink playout timing experiments.
It is intentionally broader than the phase design docs. The goal is to preserve what we now know about the system so future architecture choices start from evidence instead of rediscovering the same constraints.
## High-Level Lesson
The application is not just a renderer with a video output attached.
It is a real-time playout system with several independent clocks:
- the selected output cadence, for example 59.94 fps
- the GPU render/readback timeline
- the DeckLink scheduled playback clock
- the Windows thread scheduler
- the input capture callback cadence
- the preview/window message loop
- the runtime/control update cadence
Stable playback depends on assigning one owner to each timing domain and keeping those domains loosely coupled.
## What Worked
### Named State Contracts Helped
`RenderFrameInput` and `RenderFrameState` made the render path easier to reason about.
Before that, frame rendering depended on scattered choices about snapshots, cache state, layer state, input source state, and runtime service state. Naming the frame contract made it possible to move logic out of `RenderEngine` and toward explicit frame construction.
Lesson:
- keep frame inputs explicit
- keep render-frame state immutable for the duration of a frame
- avoid making the renderer ask global systems which state it should use mid-frame
### Render-Thread Ownership Helped
Moving GL work behind a render-thread boundary reduced wrong-thread GL access risk and made ownership clearer.
The current render thread is still shared by output render, input upload, preview, screenshot, resize, and reset work, so it is not yet a pure output cadence thread. But the ownership direction is right.
Lesson:
- GL context ownership should be explicit
- public methods should enqueue or request work
- render-thread methods should own GL bodies
- synchronous calls should be reserved for places that genuinely need a result
### Background Persistence Was Worth It
Moving persistence away from hot render/control paths reduced incidental latency risk and made state writes easier to reason about.
Lesson:
- runtime/control persistence should not sit on output render timing
- shutdown flushing is fine, steady-state blocking is not
### Lifecycle State Was Worth It
The backend lifecycle model gave us better failure and shutdown vocabulary.
This became important once startup stopped being a single `Start()` call and became:
- prepare output schedule
- start render cadence
- warm up real frames
- start input streams
- start scheduled playback
Lesson:
- playout startup needs phases
- degradation should be explicit
- shutdown order should be deliberate and testable
## What Did Not Work
### Completion-Driven Rendering Was Too Fragile
Rendering on or near DeckLink completion can average the target frame rate, but it leaves no headroom.
When the callback asks for a frame just-in-time, any small delay in render, readback, scheduling, or Windows wake timing becomes visible as a buffer dip or stutter.
Lesson:
- DeckLink completion should release scheduled resources and wake scheduling
- it should not render
- it should not decide visual fallback policy in steady state
### Black Fallback Hid The Real Timing Problem
Scheduling black on app-ready underrun made the pipeline appear to keep moving while producing visible black flicker.
It also made diagnosis harder because DeckLink could still report scheduled frames while the visible output was failing.
Lesson:
- black is a startup/error/degraded-state policy, not normal steady-state recovery
- steady-state underruns should be measured as timing failures
### Synthetic Schedule Lead Was Misleading
The synthetic scheduled/completed index could report a large buffer while DeckLink still showed low actual device buffer depth.
Real DeckLink `GetBufferedVideoFrameCount()` telemetry was necessary to separate:
- app-owned scheduled slots
- synthetic schedule lead
- actual hardware/device buffer depth
Lesson:
- measure actual device buffer depth
- keep synthetic counters only as diagnostics
- do not infer device health from internal stream indexes alone
### More Buffer Is Not Automatically Smoother
Increasing DeckLink scheduled frames sometimes made the reported device buffer look healthier while visible motion still stuttered.
The problem was not only "how many frames are scheduled"; it was also whether the scheduled frames represented a stable render cadence.
Lesson:
- buffer depth absorbs jitter, but it cannot fix bad cadence ownership
- a full buffer of poorly timed or repeated frames can still look wrong
### Speed-Up Catch-Up Was The Wrong Instinct
Letting the producer sprint to refill the buffer created new timing artifacts.
The render side should behave like a stable game/render loop: render at the selected cadence, record lateness, and only skip ticks when render/GPU work itself overruns.
Lesson:
- the render thread should not render faster because DeckLink is empty
- buffer drain is a failure signal, not a sprint signal
- warmup should fill buffers before playback starts
## GPU Readback Lessons
### The Original Readback Path Was The Major Collapse
Early Phase 7.5 telemetry showed `glReadPixels(..., nullptr)` into the PBO costing roughly 8-14 ms on representative samples. That was enough to collapse ready depth and cause long freezes.
Direct synchronous readback was worse on the sampled machine.
Cached-output mode, while visually invalid for live output, immediately recovered timing. That proved ongoing GPU-to-CPU transfer was the major cost in that version of the path.
Lesson:
- isolate readback cost from render cost
- use intentionally invalid cached-output experiments when diagnosing throughput
- do not assume async PBO is actually cheap on every format/driver path
### BGRA8 Packing Changed The Problem
Changing the output path so readback matched the DeckLink BGRA8 format made `asyncQueueReadPixelsMs` drop dramatically in sampled runs.
Long pauses disappeared and the remaining issue became short stutters/cadence gaps.
Lesson:
- output/readback format matters
- avoid format conversions on the readback path when possible
- BGRA8 is a good current format target for experiments
- v210/YUV packing can be deferred until cadence is stable
### DeckLink SDK Fast Transfer Was Not Available On The Test GPU
The SDK OpenGL fast-transfer path depends on hardware/extension support that was not present on the RTX 4060 Ti test machine:
- the NVIDIA DVP path is gated on Quadro-class GPU support
- `GL_AMD_pinned_memory` was not exposed
Lesson:
- SDK fast-transfer samples are useful references but not a universal fix
- unsupported fast-transfer code should not be central to the architecture
- the default path must work with ordinary consumer GPUs
## DeckLink Lessons
### DeckLink Wants Scheduled System-Memory Frames
Using `CreateVideoFrameWithBuffer()` lets DeckLink schedule frames backed by our system-memory slots.
That is the right ownership model for this app:
- render/readback writes into a slot
- DeckLink schedules a frame that references that slot
- the slot is protected until DeckLink completion
Lesson:
- system-memory slots are the contract between render and playout
- scheduled slots must not be recycled early
- completed-but-unscheduled slots can be latest-N cache entries
### Startup Needs Real Preroll
Starting scheduled playback before real rendered frames exist creates avoidable startup fragility.
The better startup shape is:
- prepare the DeckLink schedule
- start render cadence
- render warmup frames at normal cadence
- schedule those frames as preroll
- start DeckLink scheduled playback
Lesson:
- do not use black preroll as the normal startup path
- do not render faster during warmup
- if warmup cannot fill in a bounded time, fail/degrade visibly
## Buffering Lessons
### There Are Two Different Buffers
The app has at least two important frame stores:
- system-memory completed/latest-N frames
- DeckLink scheduled/device buffer
They have different ownership rules.
Completed-but-unscheduled frames are disposable if a newer frame is available and cadence needs the slot.
Scheduled frames are not disposable because DeckLink may still read them.
Lesson:
- latest-N completed frames are a cache
- scheduled frames are owned by DeckLink until completion
- keep metrics for both
### Consume-Before-Render Is The Wrong Model For Completed Frames
If the render cadence waits for completed frames to be consumed, DeckLink timing can indirectly slow the renderer.
That couples the clocks again.
Lesson:
- render cadence should keep rendering at selected cadence
- if completed cache is full, recycle/drop the oldest unscheduled completed frame
- only scheduled/in-flight saturation should prevent rendering to a safe slot
## Render Thread Lessons
### The Current Render Thread Is Still Shared
The GL render thread currently handles:
- output rendering
- input upload
- preview present
- screenshot capture
- render reset commands
- shader/resource operations
Output render can therefore be delayed by queued or inline work.
Lesson:
- "one GL thread" is not the same as "one output cadence thread"
- output render should become the highest-priority GL operation
- non-output GL work needs budgets, coalescing, or deferral
### Input Upload Is A Suspect Timing Coupling
Output render currently processes input upload work immediately before rendering the output frame.
That keeps input fresh but can steal time from the exact frame we are trying to render on cadence.
Lesson:
- measure input upload count and time immediately before output render
- test policies such as `one_before_output` or `skip_before_output`
- prefer latest-input semantics over draining every pending upload
### Preview And Screenshot Must Stay Secondary
Preview is useful, but DeckLink output is the real-time path.
Screenshot and preview share GL resources and can block or queue work on the same render thread.
Lesson:
- preview should be skipped when output is under pressure
- screenshot capture should be treated as disruptive unless proven otherwise
- forced preview/screenshot should be visible in telemetry
## Telemetry Lessons
The useful telemetry has been the telemetry that separates domains:
- output render queue wait
- render/draw time
- readback queue time
- readback fence/map/copy time
- app ready/completed queue depth
- system-memory free/rendering/completed/scheduled counts
- actual DeckLink buffered-frame count
- DeckLink schedule-call time/failures
- late/drop completion counts
Lesson:
- averages are not enough
- timing spikes matter more than steady low values
- count ownership states, not just queue depth
- keep experiment logs short and evidence-based
## Current Architectural Direction
The current direction is still sound:
```text
Render cadence loop
  renders at selected output cadence
  writes latest-N completed system-memory frames
  never sprints to refill DeckLink

Frame store
  owns free / rendering / completed / scheduled slots
  recycles unscheduled completed frames when needed
  protects scheduled frames until completion

DeckLink playout scheduler
  consumes completed frames
  tops up actual device buffer
  never renders

Completion callback
  releases scheduled slots
  records completion result
  wakes scheduler
```
## Rewrite Lesson
A full restart is not obviously the right next move.
The current repo now contains:
- working runtime/control architecture
- useful phase docs
- non-GL tests around key state machines
- real telemetry
- a clearer understanding of DeckLink and OpenGL timing
The better next step is likely a contained "V2 spine" inside the current app:
- harden the render cadence loop
- harden the frame store
- separate DeckLink scheduling
- demote preview/screenshot/input upload below output cadence
- delete old compatibility branches as they become unnecessary
A full rewrite becomes attractive only if the current GL ownership model cannot be made deterministic without excessive surgery, or if the project switches rendering API.
## Practical Rules Going Forward
- One timing authority per domain.
- Render cadence is time-driven, not completion-driven.
- DeckLink scheduling is device-buffer-driven, not render-driven.
- Completion callbacks release and report; they do not render.
- System-memory completed frames are latest-N cache entries.
- Scheduled frames are protected until DeckLink completion.
- Startup uses real rendered warmup/preroll.
- Black fallback is degraded/error behavior, not steady-state behavior.
- Output render has priority over preview, screenshot, and bulk input upload.
- Measure before adding recovery branches.

View File

@@ -372,18 +372,18 @@ DeckLink output should not start consuming before the render cadence has prepare
Initial behavior:
- configure DeckLink output without starting scheduled playback
- start the render cadence producer
- render warmup frames at the selected cadence, not faster
- wait until completed-frame depth reaches `targetWarmupFrames`
- schedule those completed frames as DeckLink preroll
- call `StartScheduledPlayback()`
- [x] configure DeckLink output without starting scheduled playback
- [x] start the render cadence producer
- [x] render warmup frames at the selected cadence, not faster
- [x] wait until scheduled preroll reaches `targetPrerollFrames`
- [x] schedule completed system-memory frames as DeckLink preroll
- [x] call `StartScheduledPlayback()`
Exit criteria:
- startup does not require the render producer to catch up by rendering faster than cadence
- DeckLink begins playback with a real completed-frame buffer
- if warmup cannot fill within a bounded timeout, startup enters degraded state with telemetry
- [x] startup does not require the render producer to catch up by rendering faster than cadence
- [x] DeckLink begins playback with a real rendered preroll buffer
- [x] if warmup cannot fill within a bounded timeout, startup enters degraded state with telemetry
### Step 5: Make DeckLink Scheduler A Separate Top-Up Loop

View File

@@ -54,8 +54,15 @@ public:
return true;
}
bool Start() override
// Fake phase hook: records that the output schedule was prepared so the test
// can assert on it through PreparedOutputSchedule(). Always succeeds.
bool PrepareOutputSchedule() override
{
    mPreparedOutputSchedule = true;

    return true;
}
bool StartInputStreams() override
{
mInputStreamsStarted = true;
mState.hasInputSource = true;
VideoIOFrame input;
input.bytes = mInputBytes.data();
@@ -65,11 +72,22 @@ public:
input.pixelFormat = mState.inputPixelFormat;
if (mInputCallback)
mInputCallback(input);
return true;
}
// Fake phase hook: records that scheduled playback started and, when an output
// callback is registered, delivers one Completed completion to it.
// Always succeeds.
bool StartScheduledPlayback() override
{
    mScheduledPlaybackStarted = true;

    if (!mOutputCallback)
        return true;

    mOutputCallback(VideoIOCompletion{ VideoIOCompletionResult::Completed });
    return true;
}
// Single-call startup: runs the three phase hooks in order and stops at the
// first one that fails.
bool Start() override
{
    if (!PrepareOutputSchedule())
        return false;
    if (!StartInputStreams())
        return false;
    return StartScheduledPlayback();
}
bool Stop() override { return true; }
const VideoIOState& State() const override { return mState; }
VideoIOState& MutableState() override { return mState; }
@@ -103,6 +121,9 @@ public:
}
unsigned ScheduledFrames() const { return mScheduledFrames; }
bool PreparedOutputSchedule() const { return mPreparedOutputSchedule; }
bool InputStreamsStarted() const { return mInputStreamsStarted; }
bool ScheduledPlaybackStarted() const { return mScheduledPlaybackStarted; }
VideoIOCompletionResult LastCompletion() const { return mLastCompletion; }
uint64_t LastReadyQueueDepth() const { return mLastReadyQueueDepth; }
@@ -113,6 +134,9 @@ private:
std::array<unsigned char, 3840> mInputBytes = {};
std::array<unsigned char, 7680> mOutputBytes = {};
unsigned mScheduledFrames = 0;
bool mPreparedOutputSchedule = false;
bool mInputStreamsStarted = false;
bool mScheduledPlaybackStarted = false;
VideoIOCompletionResult mLastCompletion = VideoIOCompletionResult::Unknown;
uint64_t mLastReadyQueueDepth = 0;
};
@@ -144,6 +168,9 @@ int main()
Expect(inputSeen, "fake input callback emits generic frame");
Expect(outputSeen, "fake output callback emits generic completion");
Expect(device.PreparedOutputSchedule(), "fake output schedule was prepared");
Expect(device.InputStreamsStarted(), "fake input streams started");
Expect(device.ScheduledPlaybackStarted(), "fake scheduled playback started");
Expect(device.ScheduledFrames() == 1, "fake backend schedules one frame");
Expect(device.LastCompletion() == VideoIOCompletionResult::Completed, "fake backend records generic completion");
Expect(device.LastReadyQueueDepth() == 2, "fake backend records ready queue depth");