diff --git a/.vscode/launch.json b/.vscode/launch.json index 1b176ba..706a9ad 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,6 +17,50 @@ "moduleLoad": true }, "preLaunchTask": "Build LoopThroughWithOpenGLCompositing Debug x64" + }, + { + "name": "Debug LoopThroughWithOpenGLCompositing - sync readback experiment", + "type": "cppvsdbg", + "request": "launch", + "program": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug\\LoopThroughWithOpenGLCompositing.exe", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug", + "environment": [ + { + "name": "VST_OUTPUT_READBACK_MODE", + "value": "sync" + } + ], + "console": "internalConsole", + "symbolSearchPath": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug", + "requireExactSource": true, + "logging": { + "moduleLoad": true + }, + "preLaunchTask": "Build LoopThroughWithOpenGLCompositing Debug x64" + }, + { + "name": "Debug LoopThroughWithOpenGLCompositing - cached output experiment", + "type": "cppvsdbg", + "request": "launch", + "program": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug\\LoopThroughWithOpenGLCompositing.exe", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug", + "environment": [ + { + "name": "VST_OUTPUT_READBACK_MODE", + "value": "cached_only" + } + ], + "console": "internalConsole", + "symbolSearchPath": "${workspaceFolder}\\build\\vs2022-x64-debug\\Debug", + "requireExactSource": true, + "logging": { + "moduleLoad": true + }, + "preLaunchTask": "Build LoopThroughWithOpenGLCompositing Debug x64" } ] } diff --git a/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.cpp b/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.cpp index 32d1ca5..281080a 100644 --- a/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.cpp @@ -18,6 +18,7 @@ RenderEngine::RenderEngine( mRenderPass(mRenderer), mRenderPipeline(mRenderer, runtimeSnapshotProvider, healthTelemetry, std::move(renderEffect), std::move(screenshotReady), std::move(previewPaint)), mShaderPrograms(mRenderer, runtimeSnapshotProvider), + mHealthTelemetry(healthTelemetry), mHdc(hdc), mHglrc(hglrc), mFrameStateResolver(runtimeSnapshotProvider) @@ -546,7 +547,11 @@ bool RenderEngine::RequestOutputFrame(const RenderPipelineFrameContext& context, { if (mRenderThreadRunning) { - return TryInvokeOnRenderThread("output-render", [this, &context, &outputFrame]() { + const auto queuedAt = std::chrono::steady_clock::now(); + return TryInvokeOnRenderThread("output-render", [this, &context, &outputFrame, queuedAt]() { + const auto startedAt = std::chrono::steady_clock::now(); + const double queueWaitMilliseconds = std::chrono::duration_cast>(startedAt - queuedAt).count(); + mHealthTelemetry.TryRecordOutputRenderQueueWait(queueWaitMilliseconds); mRenderCommandQueue.RequestOutputFrame({ context.videoState, context.completion }); RenderOutputFrameRequest request; return mRenderCommandQueue.TryTakeOutputFrame(request) && diff --git a/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.h b/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.h index 4482808..72ee54d 100644 --- a/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.h +++ b/apps/LoopThroughWithOpenGLCompositing/gl/RenderEngine.h @@ -209,6 +209,7 @@ private: OpenGLRenderPass mRenderPass; OpenGLRenderPipeline mRenderPipeline; OpenGLShaderPrograms mShaderPrograms; + HealthTelemetry& mHealthTelemetry; HDC mHdc; HGLRC mHglrc; diff --git a/apps/LoopThroughWithOpenGLCompositing/gl/composite/OpenGLComposite.cpp b/apps/LoopThroughWithOpenGLCompositing/gl/composite/OpenGLComposite.cpp index 6efbf99..0978a9d 100644 --- a/apps/LoopThroughWithOpenGLCompositing/gl/composite/OpenGLComposite.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/gl/composite/OpenGLComposite.cpp @@ -164,6 +164,9 @@ error: void OpenGLComposite::paintGL(bool force) { + if (mRuntimeUpdateController) + mRuntimeUpdateController->ProcessRuntimeWork(); + if (!force) { if (IsIconic(hGLWnd)) @@ -171,6 +174,12 @@ void OpenGLComposite::paintGL(bool force) } const unsigned previewFps = mRuntimeStore ? mRuntimeStore->GetConfiguredPreviewFps() : 30u; + if (!force && mVideoBackend && mVideoBackend->ShouldPrioritizeOutputOverPreview()) + { + ValidateRect(hGLWnd, NULL); + return; + } + if (!mRenderEngine->TryPresentPreview(force, previewFps, mVideoBackend->OutputFrameWidth(), mVideoBackend->OutputFrameHeight())) { ValidateRect(hGLWnd, NULL); @@ -261,6 +270,9 @@ bool OpenGLComposite::Start() if (!mRenderEngine->StartRenderThread()) return false; + if (mRuntimeUpdateController) + mRuntimeUpdateController->ProcessRuntimeWork(); + if (mVideoBackend->Start()) return true; @@ -351,9 +363,6 @@ bool OpenGLComposite::RequestScreenshot(std::string& error) void OpenGLComposite::renderEffect() { - if (mRuntimeUpdateController) - mRuntimeUpdateController->ProcessRuntimeWork(); - const RenderFrameInput frameInput = BuildRenderFrameInput(); RenderFrame(frameInput); } diff --git a/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.cpp b/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.cpp index 93b937b..f84069f 100644 --- a/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.cpp @@ -8,7 +8,9 @@ #include #include +#include #include +#include OpenGLRenderPipeline::OpenGLRenderPipeline( OpenGLRenderer& renderer, @@ -22,7 +24,8 @@ OpenGLRenderPipeline::OpenGLRenderPipeline( mHealthTelemetry(healthTelemetry), mRenderEffect(renderEffect), mOutputReady(outputReady), - mPaint(paint) + mPaint(paint), + mOutputReadbackMode(ReadOutputReadbackModeFromEnvironment()) { } @@ -53,9 +56,22 @@ bool OpenGLRenderPipeline::RenderFrame(const RenderPipelineFrameContext& context mHealthTelemetry.TryRecordPerformanceStats(state.frameBudgetMilliseconds, renderMilliseconds); mRuntimeSnapshotProvider.AdvanceFrame(); - ReadOutputFrame(state, outputFrame); - if (mPaint) - mPaint(); + OutputReadbackTiming readbackTiming = ReadOutputFrame(state, outputFrame); + mHealthTelemetry.TryRecordOutputRenderPipelineTiming( + renderMilliseconds, + readbackTiming.fenceWaitMilliseconds, + readbackTiming.mapMilliseconds, + readbackTiming.copyMilliseconds, + readbackTiming.cachedCopyMilliseconds, + readbackTiming.asyncQueueMilliseconds, + readbackTiming.asyncQueueBufferMilliseconds, + readbackTiming.asyncQueueSetupMilliseconds, + readbackTiming.asyncQueueReadPixelsMilliseconds, + readbackTiming.asyncQueueFenceMilliseconds, + readbackTiming.syncReadMilliseconds, + readbackTiming.asyncReadbackMissed, + readbackTiming.cachedFallbackUsed, + readbackTiming.syncFallbackUsed); return true; } @@ -151,8 +167,9 @@ void OpenGLRenderPipeline::FlushAsyncReadbackPipeline() mAsyncReadbackReadIndex = 0; } -void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state) +bool OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing) { + const auto queueStartTime = std::chrono::steady_clock::now(); const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10; const std::size_t requiredBytes = static_cast(state.outputFrameRowBytes) * state.outputFrameSize.height; const GLenum format = usePackedOutput ? GL_RGBA : GL_BGRA; @@ -161,8 +178,16 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state) const GLsizei readWidth = static_cast(usePackedOutput ? state.outputPackTextureWidth : state.outputFrameSize.width); const GLsizei readHeight = static_cast(state.outputFrameSize.height); + const auto finishTiming = [&timing, queueStartTime]() { + const auto queueEndTime = std::chrono::steady_clock::now(); + timing.asyncQueueMilliseconds += std::chrono::duration_cast>(queueEndTime - queueStartTime).count(); + }; + if (requiredBytes == 0) - return; + { + finishTiming(); + return false; + } if (mAsyncReadbackBytes != requiredBytes || mAsyncReadbackFormat != format @@ -173,30 +198,50 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state) mAsyncReadbackType = type; mAsyncReadbackFramebuffer = framebuffer; if (!EnsureAsyncReadbackBuffers(requiredBytes)) - return; + { + finishTiming(); + return false; + } } AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackWriteIndex]; - if (slot.fence != nullptr) + if (slot.inFlight) { - glDeleteSync(slot.fence); - slot.fence = nullptr; + finishTiming(); + return false; } + auto stageStartTime = std::chrono::steady_clock::now(); glPixelStorei(GL_PACK_ALIGNMENT, 4); glPixelStorei(GL_PACK_ROW_LENGTH, 0); glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer); glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer); + auto stageEndTime = std::chrono::steady_clock::now(); + timing.asyncQueueSetupMilliseconds += std::chrono::duration_cast>(stageEndTime - stageStartTime).count(); + + stageStartTime = std::chrono::steady_clock::now(); glBufferData(GL_PIXEL_PACK_BUFFER, static_cast(requiredBytes), nullptr, GL_STREAM_READ); + stageEndTime = std::chrono::steady_clock::now(); + timing.asyncQueueBufferMilliseconds += std::chrono::duration_cast>(stageEndTime - stageStartTime).count(); + + stageStartTime = std::chrono::steady_clock::now(); glReadPixels(0, 0, readWidth, readHeight, format, type, nullptr); + stageEndTime = std::chrono::steady_clock::now(); + timing.asyncQueueReadPixelsMilliseconds += std::chrono::duration_cast>(stageEndTime - stageStartTime).count(); + + stageStartTime = std::chrono::steady_clock::now(); slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + stageEndTime = std::chrono::steady_clock::now(); + timing.asyncQueueFenceMilliseconds += std::chrono::duration_cast>(stageEndTime - stageStartTime).count(); slot.inFlight = slot.fence != nullptr; glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); mAsyncReadbackWriteIndex = (mAsyncReadbackWriteIndex + 1) % mAsyncReadbackSlots.size(); + finishTiming(); + return slot.inFlight; } -bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds) +bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing) { if (mAsyncReadbackBytes == 0 || outputFrame.bytes == nullptr) return false; @@ -206,15 +251,24 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra return false; const GLenum waitFlags = timeoutNanoseconds > 0 ? GL_SYNC_FLUSH_COMMANDS_BIT : 0; + const auto waitStartTime = std::chrono::steady_clock::now(); const GLenum waitResult = glClientWaitSync(slot.fence, waitFlags, timeoutNanoseconds); + const auto waitEndTime = std::chrono::steady_clock::now(); + timing.fenceWaitMilliseconds += std::chrono::duration_cast>(waitEndTime - waitStartTime).count(); if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED) + { + timing.asyncReadbackMissed = true; return false; + } glDeleteSync(slot.fence); slot.fence = nullptr; glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer); + const auto mapStartTime = std::chrono::steady_clock::now(); void* mappedBytes = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); + const auto mapEndTime = std::chrono::steady_clock::now(); + timing.mapMilliseconds += std::chrono::duration_cast>(mapEndTime - mapStartTime).count(); if (mappedBytes == nullptr) { glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -223,7 +277,10 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra return false; } + const auto copyStartTime = std::chrono::steady_clock::now(); std::memcpy(outputFrame.bytes, mappedBytes, slot.sizeBytes); + const auto copyEndTime = std::chrono::steady_clock::now(); + timing.copyMilliseconds += std::chrono::duration_cast>(copyEndTime - copyStartTime).count(); glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -243,8 +300,26 @@ void OpenGLRenderPipeline::CacheOutputFrame(const VideoIOOutputFrame& outputFram std::memcpy(mCachedOutputFrame.data(), outputFrame.bytes, byteCount); } -void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes) +bool OpenGLRenderPipeline::TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const { + if (outputFrame.bytes == nullptr || outputFrame.height == 0 || outputFrame.rowBytes <= 0) + return false; + + const std::size_t byteCount = static_cast(outputFrame.rowBytes) * outputFrame.height; + if (mCachedOutputFrame.size() != byteCount) + return false; + + const auto copyStartTime = std::chrono::steady_clock::now(); + std::memcpy(outputFrame.bytes, mCachedOutputFrame.data(), byteCount); + const auto copyEndTime = std::chrono::steady_clock::now(); + timing.cachedCopyMilliseconds += std::chrono::duration_cast>(copyEndTime - copyStartTime).count(); + timing.cachedFallbackUsed = true; + return true; +} + +void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing) +{ + const auto readStartTime = std::chrono::steady_clock::now(); const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10; glPixelStorei(GL_PACK_ALIGNMENT, 4); @@ -259,24 +334,78 @@ void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& stat glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputFramebuffer()); glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, destinationBytes); } + const auto readEndTime = std::chrono::steady_clock::now(); + timing.syncReadMilliseconds += std::chrono::duration_cast>(readEndTime - readStartTime).count(); + timing.syncFallbackUsed = true; } -void OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame) +OpenGLRenderPipeline::OutputReadbackTiming OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame) { - if (TryConsumeAsyncReadback(outputFrame, 500000)) + OutputReadbackTiming timing; + + if (mOutputReadbackMode == OutputReadbackMode::Synchronous) { - QueueAsyncReadback(state); - return; + if (outputFrame.bytes != nullptr) + { + ReadOutputFrameSynchronously(state, outputFrame.bytes, timing); + CacheOutputFrame(outputFrame); + } + return timing; } - // If async readback misses the playout deadline, prefer a fresh synchronous - // frame over reusing stale cached output, then restart the async pipeline. - if (outputFrame.bytes != nullptr) + if (mOutputReadbackMode == OutputReadbackMode::CachedOnly) { - ReadOutputFrameSynchronously(state, outputFrame.bytes); + if (TryCopyCachedOutputFrame(outputFrame, timing)) + return timing; + + if (outputFrame.bytes != nullptr) + { + ReadOutputFrameSynchronously(state, outputFrame.bytes, timing); + CacheOutputFrame(outputFrame); + } + return timing; + } + + if (TryConsumeAsyncReadback(outputFrame, 500000, timing)) + { + (void)QueueAsyncReadback(state, timing); + return timing; + } + + if (TryCopyCachedOutputFrame(outputFrame, timing)) + { + (void)QueueAsyncReadback(state, timing); + return timing; + } + + // Bootstrap only: until the first async readback has produced cached output, + // use one synchronous readback so DeckLink has a valid frame to schedule. + if (outputFrame.bytes != nullptr && mCachedOutputFrame.empty()) + { + ReadOutputFrameSynchronously(state, outputFrame.bytes, timing); CacheOutputFrame(outputFrame); } FlushAsyncReadbackPipeline(); - QueueAsyncReadback(state); + (void)QueueAsyncReadback(state, timing); + return timing; +} + +OpenGLRenderPipeline::OutputReadbackMode OpenGLRenderPipeline::ReadOutputReadbackModeFromEnvironment() +{ + char* mode = nullptr; + std::size_t modeSize = 0; + if (_dupenv_s(&mode, &modeSize, "VST_OUTPUT_READBACK_MODE") != 0 || mode == nullptr) + return OutputReadbackMode::AsyncPbo; + + const std::string modeValue(mode); + std::free(mode); + if (modeValue == "async_pbo") + return OutputReadbackMode::AsyncPbo; + if (modeValue == "sync") + return OutputReadbackMode::Synchronous; + if (modeValue == "cached_only") + return OutputReadbackMode::CachedOnly; + + return OutputReadbackMode::AsyncPbo; } diff --git a/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.h b/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.h index 4fc6cce..59d7bf2 100644 --- a/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.h +++ b/apps/LoopThroughWithOpenGLCompositing/gl/pipeline/OpenGLRenderPipeline.h @@ -36,6 +36,13 @@ public: bool RenderFrame(const RenderPipelineFrameContext& context, VideoIOOutputFrame& outputFrame); private: + enum class OutputReadbackMode + { + AsyncPbo, + Synchronous, + CachedOnly + }; + struct AsyncReadbackSlot { GLuint pixelPackBuffer = 0; @@ -44,15 +51,34 @@ private: bool inFlight = false; }; + struct OutputReadbackTiming + { + double fenceWaitMilliseconds = 0.0; + double mapMilliseconds = 0.0; + double copyMilliseconds = 0.0; + double cachedCopyMilliseconds = 0.0; + double asyncQueueMilliseconds = 0.0; + double asyncQueueBufferMilliseconds = 0.0; + double asyncQueueSetupMilliseconds = 0.0; + double asyncQueueReadPixelsMilliseconds = 0.0; + double asyncQueueFenceMilliseconds = 0.0; + double syncReadMilliseconds = 0.0; + bool asyncReadbackMissed = false; + bool cachedFallbackUsed = false; + bool syncFallbackUsed = false; + }; + bool EnsureAsyncReadbackBuffers(std::size_t requiredBytes); void ResetAsyncReadbackState(); void FlushAsyncReadbackPipeline(); - void QueueAsyncReadback(const VideoIOState& state); - bool TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds); + bool QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing); + bool TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing); void CacheOutputFrame(const VideoIOOutputFrame& outputFrame); - void ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes); + bool TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const; + void ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing); void PackOutputFor10Bit(const VideoIOState& state); - void ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame); + OutputReadbackTiming ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame); + static OutputReadbackMode ReadOutputReadbackModeFromEnvironment(); OpenGLRenderer& mRenderer; RuntimeSnapshotProvider& mRuntimeSnapshotProvider; @@ -60,6 +86,7 @@ private: RenderEffectCallback mRenderEffect; OutputReadyCallback mOutputReady; PaintCallback mPaint; + OutputReadbackMode mOutputReadbackMode = OutputReadbackMode::AsyncPbo; std::array mAsyncReadbackSlots; std::size_t mAsyncReadbackWriteIndex = 0; std::size_t mAsyncReadbackReadIndex = 0; diff --git a/apps/LoopThroughWithOpenGLCompositing/runtime/presentation/RuntimeStatePresenter.cpp b/apps/LoopThroughWithOpenGLCompositing/runtime/presentation/RuntimeStatePresenter.cpp index 8aae97f..051a4b7 100644 --- a/apps/LoopThroughWithOpenGLCompositing/runtime/presentation/RuntimeStatePresenter.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/runtime/presentation/RuntimeStatePresenter.cpp @@ -95,6 +95,24 @@ JsonValue RuntimeStatePresenter::BuildRuntimeStateValue(const RuntimeStore& runt outputRender.set("renderMs", JsonValue(telemetrySnapshot.backendPlayout.outputRenderMilliseconds)); outputRender.set("smoothedRenderMs", JsonValue(telemetrySnapshot.backendPlayout.smoothedOutputRenderMilliseconds)); outputRender.set("maxRenderMs", JsonValue(telemetrySnapshot.backendPlayout.maxOutputRenderMilliseconds)); + outputRender.set("acquireFrameMs", JsonValue(telemetrySnapshot.backendPlayout.outputFrameAcquireMilliseconds)); + outputRender.set("renderRequestMs", JsonValue(telemetrySnapshot.backendPlayout.outputFrameRenderRequestMilliseconds)); + outputRender.set("endAccessMs", JsonValue(telemetrySnapshot.backendPlayout.outputFrameEndAccessMilliseconds)); + outputRender.set("queueWaitMs", JsonValue(telemetrySnapshot.backendPlayout.outputRenderQueueWaitMilliseconds)); + outputRender.set("drawMs", JsonValue(telemetrySnapshot.backendPlayout.outputRenderDrawMilliseconds)); + outputRender.set("fenceWaitMs", JsonValue(telemetrySnapshot.backendPlayout.outputReadbackFenceWaitMilliseconds)); + outputRender.set("mapMs", JsonValue(telemetrySnapshot.backendPlayout.outputReadbackMapMilliseconds)); + outputRender.set("readbackCopyMs", JsonValue(telemetrySnapshot.backendPlayout.outputReadbackCopyMilliseconds)); + outputRender.set("cachedCopyMs", JsonValue(telemetrySnapshot.backendPlayout.outputCachedCopyMilliseconds)); + outputRender.set("asyncQueueMs", JsonValue(telemetrySnapshot.backendPlayout.outputAsyncQueueMilliseconds)); + outputRender.set("asyncQueueBufferMs", JsonValue(telemetrySnapshot.backendPlayout.outputAsyncQueueBufferMilliseconds)); + outputRender.set("asyncQueueSetupMs", JsonValue(telemetrySnapshot.backendPlayout.outputAsyncQueueSetupMilliseconds)); + outputRender.set("asyncQueueReadPixelsMs", JsonValue(telemetrySnapshot.backendPlayout.outputAsyncQueueReadPixelsMilliseconds)); + outputRender.set("asyncQueueFenceMs", JsonValue(telemetrySnapshot.backendPlayout.outputAsyncQueueFenceMilliseconds)); + outputRender.set("syncReadMs", JsonValue(telemetrySnapshot.backendPlayout.outputSyncReadMilliseconds)); + outputRender.set("asyncReadbackMissCount", JsonValue(static_cast(telemetrySnapshot.backendPlayout.outputAsyncReadbackMissCount))); + outputRender.set("cachedFallbackCount", JsonValue(static_cast(telemetrySnapshot.backendPlayout.outputCachedFallbackCount))); + outputRender.set("syncFallbackCount", JsonValue(static_cast(telemetrySnapshot.backendPlayout.outputSyncFallbackCount))); JsonValue recovery = JsonValue::MakeObject(); recovery.set("completionResult", JsonValue(telemetrySnapshot.backendPlayout.completionResult)); diff --git a/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.cpp b/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.cpp index c0714ed..7380214 100644 --- a/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.cpp @@ -212,6 +212,7 @@ void HealthTelemetry::RecordBackendPlayoutHealth(const std::string& lifecycleSta std::size_t minReadyQueueDepth, std::size_t maxReadyQueueDepth, uint64_t readyQueueZeroDepthCount, uint64_t readyQueuePoppedCount, uint64_t readyQueueDroppedCount, uint64_t readyQueueUnderrunCount, double outputRenderMilliseconds, double smoothedOutputRenderMilliseconds, double maxOutputRenderMilliseconds, + double outputFrameAcquireMilliseconds, double outputFrameRenderRequestMilliseconds, double outputFrameEndAccessMilliseconds, uint64_t completedFrameIndex, uint64_t scheduledFrameIndex, uint64_t scheduledLeadFrames, uint64_t measuredLagFrames, uint64_t catchUpFrames, uint64_t lateStreak, uint64_t dropStreak, uint64_t lateFrameCount, uint64_t droppedFrameCount, uint64_t flushedFrameCount, @@ -232,6 +233,9 @@ void HealthTelemetry::RecordBackendPlayoutHealth(const std::string& lifecycleSta mBackendPlayout.outputRenderMilliseconds = std::max(outputRenderMilliseconds, 0.0); mBackendPlayout.smoothedOutputRenderMilliseconds = std::max(smoothedOutputRenderMilliseconds, 0.0); mBackendPlayout.maxOutputRenderMilliseconds = std::max(maxOutputRenderMilliseconds, 0.0); + mBackendPlayout.outputFrameAcquireMilliseconds = std::max(outputFrameAcquireMilliseconds, 0.0); + mBackendPlayout.outputFrameRenderRequestMilliseconds = std::max(outputFrameRenderRequestMilliseconds, 0.0); + mBackendPlayout.outputFrameEndAccessMilliseconds = std::max(outputFrameEndAccessMilliseconds, 0.0); mBackendPlayout.completedFrameIndex = completedFrameIndex; mBackendPlayout.scheduledFrameIndex = scheduledFrameIndex; mBackendPlayout.scheduledLeadFrames = scheduledLeadFrames; @@ -251,6 +255,7 @@ bool HealthTelemetry::TryRecordBackendPlayoutHealth(const std::string& lifecycle std::size_t minReadyQueueDepth, std::size_t maxReadyQueueDepth, uint64_t readyQueueZeroDepthCount, uint64_t readyQueuePoppedCount, uint64_t readyQueueDroppedCount, uint64_t readyQueueUnderrunCount, double outputRenderMilliseconds, double smoothedOutputRenderMilliseconds, double maxOutputRenderMilliseconds, + double outputFrameAcquireMilliseconds, double outputFrameRenderRequestMilliseconds, double outputFrameEndAccessMilliseconds, uint64_t completedFrameIndex, uint64_t scheduledFrameIndex, uint64_t scheduledLeadFrames, uint64_t measuredLagFrames, uint64_t catchUpFrames, uint64_t lateStreak, uint64_t dropStreak, uint64_t lateFrameCount, uint64_t droppedFrameCount, uint64_t flushedFrameCount, @@ -274,6 +279,9 @@ bool HealthTelemetry::TryRecordBackendPlayoutHealth(const std::string& lifecycle mBackendPlayout.outputRenderMilliseconds = std::max(outputRenderMilliseconds, 0.0); mBackendPlayout.smoothedOutputRenderMilliseconds = std::max(smoothedOutputRenderMilliseconds, 0.0); mBackendPlayout.maxOutputRenderMilliseconds = std::max(maxOutputRenderMilliseconds, 0.0); + mBackendPlayout.outputFrameAcquireMilliseconds = std::max(outputFrameAcquireMilliseconds, 0.0); + mBackendPlayout.outputFrameRenderRequestMilliseconds = std::max(outputFrameRenderRequestMilliseconds, 0.0); + mBackendPlayout.outputFrameEndAccessMilliseconds = std::max(outputFrameEndAccessMilliseconds, 0.0); mBackendPlayout.completedFrameIndex = completedFrameIndex; mBackendPlayout.scheduledFrameIndex = scheduledFrameIndex; mBackendPlayout.scheduledLeadFrames = scheduledLeadFrames; @@ -289,6 +297,98 @@ bool HealthTelemetry::TryRecordBackendPlayoutHealth(const std::string& lifecycle return true; } +void HealthTelemetry::RecordOutputRenderQueueWait(double queueWaitMilliseconds) +{ + std::lock_guard lock(mMutex); + mBackendPlayout.outputRenderQueueWaitMilliseconds = std::max(queueWaitMilliseconds, 0.0); +} + +bool HealthTelemetry::TryRecordOutputRenderQueueWait(double queueWaitMilliseconds) +{ + std::unique_lock lock(mMutex, std::try_to_lock); + if (!lock.owns_lock()) + return false; + + mBackendPlayout.outputRenderQueueWaitMilliseconds = std::max(queueWaitMilliseconds, 0.0); + return true; +} + +void HealthTelemetry::RecordOutputRenderPipelineTiming( + double drawMilliseconds, + double fenceWaitMilliseconds, + double mapMilliseconds, + double readbackCopyMilliseconds, + double cachedCopyMilliseconds, + double asyncQueueMilliseconds, + double asyncQueueBufferMilliseconds, + double asyncQueueSetupMilliseconds, + double asyncQueueReadPixelsMilliseconds, + double asyncQueueFenceMilliseconds, + double syncReadMilliseconds, + bool asyncReadbackMissed, + bool cachedFallbackUsed, + bool syncFallbackUsed) +{ + std::lock_guard lock(mMutex); + mBackendPlayout.outputRenderDrawMilliseconds = std::max(drawMilliseconds, 0.0); + mBackendPlayout.outputReadbackFenceWaitMilliseconds = std::max(fenceWaitMilliseconds, 0.0); + mBackendPlayout.outputReadbackMapMilliseconds = std::max(mapMilliseconds, 0.0); + mBackendPlayout.outputReadbackCopyMilliseconds = std::max(readbackCopyMilliseconds, 0.0); + mBackendPlayout.outputCachedCopyMilliseconds = std::max(cachedCopyMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueMilliseconds = std::max(asyncQueueMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueBufferMilliseconds = std::max(asyncQueueBufferMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueSetupMilliseconds = std::max(asyncQueueSetupMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueReadPixelsMilliseconds = std::max(asyncQueueReadPixelsMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueFenceMilliseconds = std::max(asyncQueueFenceMilliseconds, 0.0); + mBackendPlayout.outputSyncReadMilliseconds = std::max(syncReadMilliseconds, 0.0); + if (asyncReadbackMissed) + ++mBackendPlayout.outputAsyncReadbackMissCount; + if (cachedFallbackUsed) + ++mBackendPlayout.outputCachedFallbackCount; + if (syncFallbackUsed) + ++mBackendPlayout.outputSyncFallbackCount; +} + +bool HealthTelemetry::TryRecordOutputRenderPipelineTiming( + double drawMilliseconds, + double fenceWaitMilliseconds, + double mapMilliseconds, + double readbackCopyMilliseconds, + double cachedCopyMilliseconds, + double asyncQueueMilliseconds, + double asyncQueueBufferMilliseconds, + double asyncQueueSetupMilliseconds, + double asyncQueueReadPixelsMilliseconds, + double asyncQueueFenceMilliseconds, + double syncReadMilliseconds, + bool asyncReadbackMissed, + bool cachedFallbackUsed, + bool syncFallbackUsed) +{ + std::unique_lock lock(mMutex, std::try_to_lock); + if (!lock.owns_lock()) + return false; + + mBackendPlayout.outputRenderDrawMilliseconds = std::max(drawMilliseconds, 0.0); + mBackendPlayout.outputReadbackFenceWaitMilliseconds = std::max(fenceWaitMilliseconds, 0.0); + mBackendPlayout.outputReadbackMapMilliseconds = std::max(mapMilliseconds, 0.0); + mBackendPlayout.outputReadbackCopyMilliseconds = std::max(readbackCopyMilliseconds, 0.0); + mBackendPlayout.outputCachedCopyMilliseconds = std::max(cachedCopyMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueMilliseconds = std::max(asyncQueueMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueBufferMilliseconds = std::max(asyncQueueBufferMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueSetupMilliseconds = std::max(asyncQueueSetupMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueReadPixelsMilliseconds = std::max(asyncQueueReadPixelsMilliseconds, 0.0); + mBackendPlayout.outputAsyncQueueFenceMilliseconds = std::max(asyncQueueFenceMilliseconds, 0.0); + mBackendPlayout.outputSyncReadMilliseconds = std::max(syncReadMilliseconds, 0.0); + if (asyncReadbackMissed) + ++mBackendPlayout.outputAsyncReadbackMissCount; + if (cachedFallbackUsed) + ++mBackendPlayout.outputCachedFallbackCount; + if (syncFallbackUsed) + ++mBackendPlayout.outputSyncFallbackCount; + return true; +} + HealthTelemetry::SignalStatusSnapshot HealthTelemetry::GetSignalStatusSnapshot() const { std::lock_guard lock(mMutex); diff --git a/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.h b/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.h index 727affa..2398894 100644 --- a/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.h +++ b/apps/LoopThroughWithOpenGLCompositing/runtime/telemetry/HealthTelemetry.h @@ -97,6 +97,24 @@ public: double outputRenderMilliseconds = 0.0; double smoothedOutputRenderMilliseconds = 0.0; double maxOutputRenderMilliseconds = 0.0; + double outputFrameAcquireMilliseconds = 0.0; + double outputFrameRenderRequestMilliseconds = 0.0; + double outputFrameEndAccessMilliseconds = 0.0; + double outputRenderQueueWaitMilliseconds = 0.0; + double outputRenderDrawMilliseconds = 0.0; + double outputReadbackFenceWaitMilliseconds = 0.0; + double outputReadbackMapMilliseconds = 0.0; + double outputReadbackCopyMilliseconds = 0.0; + double outputCachedCopyMilliseconds = 0.0; + double outputAsyncQueueMilliseconds = 0.0; + double outputAsyncQueueBufferMilliseconds = 0.0; + double outputAsyncQueueSetupMilliseconds = 0.0; + double outputAsyncQueueReadPixelsMilliseconds = 0.0; + double outputAsyncQueueFenceMilliseconds = 0.0; + double outputSyncReadMilliseconds = 0.0; + uint64_t outputAsyncReadbackMissCount = 0; + uint64_t outputCachedFallbackCount = 0; + uint64_t outputSyncFallbackCount = 0; uint64_t completedFrameIndex = 0; uint64_t scheduledFrameIndex = 0; uint64_t scheduledLeadFrames = 0; @@ -161,6 +179,7 @@ public: std::size_t minReadyQueueDepth, std::size_t maxReadyQueueDepth, uint64_t readyQueueZeroDepthCount, uint64_t readyQueuePoppedCount, uint64_t readyQueueDroppedCount, uint64_t readyQueueUnderrunCount, double outputRenderMilliseconds, double smoothedOutputRenderMilliseconds, double maxOutputRenderMilliseconds, + double outputFrameAcquireMilliseconds, double outputFrameRenderRequestMilliseconds, double outputFrameEndAccessMilliseconds, uint64_t completedFrameIndex, uint64_t scheduledFrameIndex, uint64_t scheduledLeadFrames, uint64_t measuredLagFrames, uint64_t catchUpFrames, uint64_t lateStreak, uint64_t dropStreak, uint64_t lateFrameCount, uint64_t droppedFrameCount, uint64_t flushedFrameCount, @@ -170,11 +189,46 @@ public: std::size_t minReadyQueueDepth, std::size_t maxReadyQueueDepth, uint64_t readyQueueZeroDepthCount, uint64_t readyQueuePoppedCount, uint64_t readyQueueDroppedCount, uint64_t readyQueueUnderrunCount, double outputRenderMilliseconds, double smoothedOutputRenderMilliseconds, double maxOutputRenderMilliseconds, + double outputFrameAcquireMilliseconds, double outputFrameRenderRequestMilliseconds, double outputFrameEndAccessMilliseconds, uint64_t completedFrameIndex, uint64_t scheduledFrameIndex, uint64_t scheduledLeadFrames, uint64_t measuredLagFrames, uint64_t catchUpFrames, uint64_t lateStreak, uint64_t dropStreak, uint64_t lateFrameCount, uint64_t droppedFrameCount, uint64_t flushedFrameCount, bool degraded, const std::string& statusMessage); + void RecordOutputRenderQueueWait(double queueWaitMilliseconds); + bool TryRecordOutputRenderQueueWait(double queueWaitMilliseconds); + + void RecordOutputRenderPipelineTiming( + double drawMilliseconds, + double fenceWaitMilliseconds, + double mapMilliseconds, + double readbackCopyMilliseconds, + double cachedCopyMilliseconds, + double asyncQueueMilliseconds, + double asyncQueueBufferMilliseconds, + double asyncQueueSetupMilliseconds, + double asyncQueueReadPixelsMilliseconds, + double asyncQueueFenceMilliseconds, + double syncReadMilliseconds, + bool asyncReadbackMissed, + bool cachedFallbackUsed, + bool syncFallbackUsed); + bool TryRecordOutputRenderPipelineTiming( + double drawMilliseconds, + double fenceWaitMilliseconds, + double mapMilliseconds, + double readbackCopyMilliseconds, + double cachedCopyMilliseconds, + double asyncQueueMilliseconds, + double asyncQueueBufferMilliseconds, + double asyncQueueSetupMilliseconds, + double asyncQueueReadPixelsMilliseconds, + double asyncQueueFenceMilliseconds, + double syncReadMilliseconds, + bool asyncReadbackMissed, + bool cachedFallbackUsed, + bool syncFallbackUsed); + SignalStatusSnapshot GetSignalStatusSnapshot() const; VideoIOStatusSnapshot GetVideoIOStatusSnapshot() const; PerformanceSnapshot GetPerformanceSnapshot() const; diff --git a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.cpp b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.cpp index 8dca58a..29ce82c 100644 --- a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.cpp @@ -9,12 +9,14 @@ #include #include #include +#include #include VideoBackend::VideoBackend(RenderEngine& renderEngine, HealthTelemetry& healthTelemetry, RuntimeEventDispatcher& runtimeEventDispatcher) : mHealthTelemetry(healthTelemetry), mRuntimeEventDispatcher(runtimeEventDispatcher), mPlayoutPolicy(NormalizeVideoPlayoutPolicy(VideoPlayoutPolicy())), + mOutputProductionController(mPlayoutPolicy), mReadyOutputQueue(mPlayoutPolicy), mVideoIODevice(std::make_unique()), mBridge(std::make_unique(renderEngine)) @@ -80,6 +82,7 @@ bool VideoBackend::ConfigureInput(const VideoFormat& inputVideoMode, std::string bool VideoBackend::ConfigureOutput(const VideoFormat& outputVideoMode, bool externalKeyingEnabled, std::string& error) { mPlayoutPolicy = NormalizeVideoPlayoutPolicy(mPlayoutPolicy); + mOutputProductionController.Configure(mPlayoutPolicy); mReadyOutputQueue.Configure(mPlayoutPolicy); if (mLifecycle.State() != VideoBackendLifecycleState::Configuring) ApplyLifecycleTransition(VideoBackendLifecycleState::Configuring, "Configuring video backend output."); @@ -101,7 +104,10 @@ bool VideoBackend::Start() StartOutputCompletionWorker(); const bool started = mVideoIODevice->Start(); if (started) + { + StartOutputProducerWorker(); ApplyLifecycleTransition(VideoBackendLifecycleState::Running, "Video backend started."); + } else { StopOutputCompletionWorker(); @@ -113,6 +119,7 @@ bool VideoBackend::Start() bool VideoBackend::Stop() { ApplyLifecycleTransition(VideoBackendLifecycleState::Stopping, "Video backend stopping."); + StopOutputProducerWorker(); const bool stopped = mVideoIODevice->Stop(); StopOutputCompletionWorker(); if (stopped) @@ -232,6 +239,12 @@ const std::string& VideoBackend::StatusMessage() const return mVideoIODevice->StatusMessage(); } +bool VideoBackend::ShouldPrioritizeOutputOverPreview() const +{ + const RenderOutputQueueMetrics metrics = mReadyOutputQueue.GetMetrics(); + return metrics.depth < static_cast(mPlayoutPolicy.targetReadyFrames); +} + void VideoBackend::SetStatusMessage(const std::string& message) { mVideoIODevice->SetStatusMessage(message); @@ -288,27 +301,36 @@ void VideoBackend::HandleOutputFrameCompletion(const VideoIOCompletion& completi void VideoBackend::StartOutputCompletionWorker() { - std::lock_guard lock(mOutputCompletionMutex); - if (mOutputCompletionWorkerRunning) - return; + { + std::lock_guard lock(mOutputCompletionMutex); + if (mOutputCompletionWorkerRunning) + return; - mPendingOutputCompletions.clear(); - mReadyOutputQueue.Clear(); - mNextReadyOutputFrameIndex = 0; - mHasReadyQueueDepthBaseline = false; - mMinReadyQueueDepth = 0; - mMaxReadyQueueDepth = 0; - mReadyQueueZeroDepthCount = 0; - mOutputRenderMilliseconds = 0.0; - mSmoothedOutputRenderMilliseconds = 0.0; - mMaxOutputRenderMilliseconds = 0.0; - mOutputCompletionWorkerStopping = false; - mOutputCompletionWorkerRunning = true; - mOutputCompletionWorker = std::thread(&VideoBackend::OutputCompletionWorkerMain, this); + mPendingOutputCompletions.clear(); + mReadyOutputQueue.Clear(); + mNextReadyOutputFrameIndex = 0; + mHasReadyQueueDepthBaseline = false; + mMinReadyQueueDepth = 0; + mMaxReadyQueueDepth = 0; + mReadyQueueZeroDepthCount = 0; + mOutputRenderMilliseconds = 0.0; + mSmoothedOutputRenderMilliseconds = 0.0; + mMaxOutputRenderMilliseconds = 0.0; + mOutputFrameAcquireMilliseconds = 0.0; + mOutputFrameRenderRequestMilliseconds = 0.0; + mOutputFrameEndAccessMilliseconds = 0.0; + mLastLateStreak = 0; + mLastDropStreak = 0; + mOutputCompletionWorkerStopping = false; + mOutputCompletionWorkerRunning = true; + mOutputCompletionWorker = std::thread(&VideoBackend::OutputCompletionWorkerMain, this); + } } void VideoBackend::StopOutputCompletionWorker() { + StopOutputProducerWorker(); + bool shouldJoin = false; { std::lock_guard lock(mOutputCompletionMutex); @@ -322,6 +344,40 @@ void VideoBackend::StopOutputCompletionWorker() mOutputCompletionWorker.join(); } +void VideoBackend::StartOutputProducerWorker() +{ + std::lock_guard lock(mOutputProducerMutex); + if (mOutputProducerWorkerRunning) + return; + + mLastOutputProductionCompletion = VideoIOCompletion(); + mLastOutputProductionTime = std::chrono::steady_clock::time_point(); + mOutputProducerWorkerStopping = false; + mOutputProducerWorkerRunning = true; + mOutputProducerWorker = std::thread(&VideoBackend::OutputProducerWorkerMain, this); + mOutputProducerCondition.notify_one(); +} + +void VideoBackend::StopOutputProducerWorker() +{ + bool shouldJoin = false; + { + std::lock_guard lock(mOutputProducerMutex); + if (mOutputProducerWorkerRunning) + mOutputProducerWorkerStopping = true; + shouldJoin = mOutputProducerWorker.joinable(); + } + mOutputProducerCondition.notify_one(); + + if (shouldJoin) + mOutputProducerWorker.join(); +} + +void VideoBackend::NotifyOutputProducer() +{ + mOutputProducerCondition.notify_one(); +} + void VideoBackend::OutputCompletionWorkerMain() { for (;;) @@ -351,6 +407,57 @@ void VideoBackend::OutputCompletionWorkerMain() } } +void VideoBackend::OutputProducerWorkerMain() +{ + for (;;) + { + { + std::unique_lock lock(mOutputProducerMutex); + mOutputProducerCondition.wait_for(lock, OutputProducerWakeInterval()); + + if (mOutputProducerWorkerStopping) + { + mOutputProducerWorkerRunning = false; + return; + } + } + + const RenderOutputQueueMetrics metrics = mReadyOutputQueue.GetMetrics(); + RecordReadyQueueDepthSample(metrics); + const OutputProductionDecision decision = mOutputProductionController.Decide(BuildOutputProductionPressure(metrics)); + if (decision.action != OutputProductionAction::Produce || decision.requestedFrames == 0) + continue; + + VideoIOCompletion completion; + { + std::lock_guard lock(mOutputProducerMutex); + if (mOutputProducerWorkerStopping) + continue; + completion = mLastOutputProductionCompletion; + } + + const auto now = std::chrono::steady_clock::now(); + if (mLastOutputProductionTime != std::chrono::steady_clock::time_point() && + now - mLastOutputProductionTime < OutputProducerWakeInterval()) + { + continue; + } + + if (ProduceReadyOutputFrames(completion, 1) > 0) + mLastOutputProductionTime = std::chrono::steady_clock::now(); + } +} + +std::chrono::milliseconds VideoBackend::OutputProducerWakeInterval() const +{ + const double frameBudgetMilliseconds = State().frameBudgetMilliseconds; + if (frameBudgetMilliseconds <= 0.0) + return std::chrono::milliseconds(8); + + const int intervalMilliseconds = (std::max)(1, static_cast(std::floor(frameBudgetMilliseconds * 0.75))); + return std::chrono::milliseconds(intervalMilliseconds); +} + void VideoBackend::ProcessOutputFrameCompletion(const VideoIOCompletion& completion) { RecordFramePacing(completion.result); @@ -358,31 +465,69 @@ void VideoBackend::ProcessOutputFrameCompletion(const VideoIOCompletion& complet const RenderOutputQueueMetrics initialQueueMetrics = mReadyOutputQueue.GetMetrics(); RecordReadyQueueDepthSample(initialQueueMetrics); const VideoPlayoutRecoveryDecision recoveryDecision = AccountForCompletionResult(completion.result, initialQueueMetrics.depth); + { + std::lock_guard lock(mOutputMetricsMutex); + mLastLateStreak = recoveryDecision.lateStreak; + mLastDropStreak = recoveryDecision.dropStreak; + } + { + std::lock_guard lock(mOutputProducerMutex); + mLastOutputProductionCompletion = completion; + } + NotifyOutputProducer(); - FillReadyOutputQueue(completion); - if (!ScheduleReadyOutputFrame()) + if (!ScheduleReadyOutputFrame() && + (ProduceReadyOutputFrames(completion, 1) == 0 || !ScheduleReadyOutputFrame())) + { ScheduleBlackUnderrunFrame(); + } + NotifyOutputProducer(); RecordBackendPlayoutHealth(completion.result, recoveryDecision); } void VideoBackend::RecordBackendPlayoutHealth(VideoIOCompletionResult result, const VideoPlayoutRecoveryDecision& recoveryDecision) { const RenderOutputQueueMetrics queueMetrics = mReadyOutputQueue.GetMetrics(); + std::size_t minReadyQueueDepth = 0; + std::size_t maxReadyQueueDepth = 0; + uint64_t readyQueueZeroDepthCount = 0; + double outputRenderMilliseconds = 0.0; + double smoothedOutputRenderMilliseconds = 0.0; + double maxOutputRenderMilliseconds = 0.0; + double outputFrameAcquireMilliseconds = 0.0; + double outputFrameRenderRequestMilliseconds = 0.0; + double outputFrameEndAccessMilliseconds = 0.0; + { + std::lock_guard lock(mOutputMetricsMutex); + minReadyQueueDepth = mMinReadyQueueDepth; + maxReadyQueueDepth = mMaxReadyQueueDepth; + readyQueueZeroDepthCount = mReadyQueueZeroDepthCount; + outputRenderMilliseconds = mOutputRenderMilliseconds; + smoothedOutputRenderMilliseconds = mSmoothedOutputRenderMilliseconds; + maxOutputRenderMilliseconds = mMaxOutputRenderMilliseconds; + outputFrameAcquireMilliseconds = mOutputFrameAcquireMilliseconds; + outputFrameRenderRequestMilliseconds = mOutputFrameRenderRequestMilliseconds; + outputFrameEndAccessMilliseconds = mOutputFrameEndAccessMilliseconds; + } + mHealthTelemetry.TryRecordBackendPlayoutHealth( VideoBackendLifecycle::StateName(mLifecycle.State()), CompletionResultName(result), queueMetrics.depth, queueMetrics.capacity, queueMetrics.pushedCount, - mMinReadyQueueDepth, - mMaxReadyQueueDepth, - mReadyQueueZeroDepthCount, + minReadyQueueDepth, + maxReadyQueueDepth, + readyQueueZeroDepthCount, queueMetrics.poppedCount, queueMetrics.droppedCount, queueMetrics.underrunCount, - mOutputRenderMilliseconds, - mSmoothedOutputRenderMilliseconds, - mMaxOutputRenderMilliseconds, + outputRenderMilliseconds, + smoothedOutputRenderMilliseconds, + maxOutputRenderMilliseconds, + outputFrameAcquireMilliseconds, + outputFrameRenderRequestMilliseconds, + outputFrameEndAccessMilliseconds, recoveryDecision.completedFrameIndex, recoveryDecision.scheduledFrameIndex, recoveryDecision.scheduledLeadFrames, @@ -397,50 +542,93 @@ void VideoBackend::RecordBackendPlayoutHealth(VideoIOCompletionResult result, co StatusMessage()); } -bool VideoBackend::FillReadyOutputQueue(const VideoIOCompletion& completion) +std::size_t VideoBackend::ProduceReadyOutputFrames(const VideoIOCompletion& completion, std::size_t maxFrames) { + if (maxFrames == 0) + return 0; + + std::lock_guard productionLock(mOutputProductionMutex); RenderOutputQueueMetrics metrics = mReadyOutputQueue.GetMetrics(); - bool filledAny = false; - while (metrics.depth < mPlayoutPolicy.targetReadyFrames) + std::size_t producedFrames = 0; + while (producedFrames < maxFrames) { + const OutputProductionDecision decision = mOutputProductionController.Decide(BuildOutputProductionPressure(metrics)); + if (decision.action != OutputProductionAction::Produce) + break; + if (!RenderReadyOutputFrame(mVideoIODevice->State(), completion)) - return filledAny; - filledAny = true; + break; + ++producedFrames; metrics = mReadyOutputQueue.GetMetrics(); RecordReadyQueueDepthSample(metrics); } - return true; + return producedFrames; +} + +OutputProductionPressure VideoBackend::BuildOutputProductionPressure(const RenderOutputQueueMetrics& metrics) const +{ + OutputProductionPressure pressure; + pressure.readyQueueDepth = metrics.depth; + pressure.readyQueueCapacity = metrics.capacity; + pressure.readyQueueUnderrunCount = metrics.underrunCount; + { + std::lock_guard lock(mOutputMetricsMutex); + pressure.lateStreak = mLastLateStreak; + pressure.dropStreak = mLastDropStreak; + } + return pressure; } bool VideoBackend::RenderReadyOutputFrame(const VideoIOState& state, const VideoIOCompletion& completion) { const auto renderStart = std::chrono::steady_clock::now(); VideoIOOutputFrame outputFrame; + const auto acquireStart = std::chrono::steady_clock::now(); if (!BeginOutputFrame(outputFrame)) return false; + const auto acquireEnd = std::chrono::steady_clock::now(); bool rendered = true; + const auto renderRequestStart = std::chrono::steady_clock::now(); if (mBridge) rendered = mBridge->RenderScheduledFrame(state, completion, outputFrame); + const auto renderRequestEnd = std::chrono::steady_clock::now(); + const auto endAccessStart = std::chrono::steady_clock::now(); EndOutputFrame(outputFrame); + const auto endAccessEnd = std::chrono::steady_clock::now(); + const double acquireMilliseconds = std::chrono::duration_cast>(acquireEnd - acquireStart).count(); + const double renderRequestMilliseconds = std::chrono::duration_cast>(renderRequestEnd - renderRequestStart).count(); + const double endAccessMilliseconds = std::chrono::duration_cast>(endAccessEnd - endAccessStart).count(); + if (!rendered) { ApplyLifecycleTransition(VideoBackendLifecycleState::Degraded, "Output frame render request failed; skipping schedule for this frame."); const double renderMilliseconds = std::chrono::duration_cast>( std::chrono::steady_clock::now() - renderStart).count(); - RecordOutputRenderDuration(renderMilliseconds); + RecordOutputRenderDuration(renderMilliseconds, acquireMilliseconds, renderRequestMilliseconds, endAccessMilliseconds); + if (outputFrame.nativeFrame != nullptr) + { + static_cast(outputFrame.nativeFrame)->Release(); + outputFrame.nativeFrame = nullptr; + } return false; } const double renderMilliseconds = std::chrono::duration_cast>( std::chrono::steady_clock::now() - renderStart).count(); - RecordOutputRenderDuration(renderMilliseconds); + RecordOutputRenderDuration(renderMilliseconds, acquireMilliseconds, renderRequestMilliseconds, endAccessMilliseconds); RenderOutputFrame readyFrame; readyFrame.frame = outputFrame; readyFrame.frameIndex = ++mNextReadyOutputFrameIndex; - return mReadyOutputQueue.Push(readyFrame); + const bool pushed = mReadyOutputQueue.Push(readyFrame); + if (!pushed && outputFrame.nativeFrame != nullptr) + { + static_cast(outputFrame.nativeFrame)->Release(); + outputFrame.nativeFrame = nullptr; + } + return pushed; } bool VideoBackend::ScheduleReadyOutputFrame() @@ -516,6 +704,7 @@ void VideoBackend::RecordFramePacing(VideoIOCompletionResult completionResult) void VideoBackend::RecordReadyQueueDepthSample(const RenderOutputQueueMetrics& metrics) { + std::lock_guard lock(mOutputMetricsMutex); if (!mHasReadyQueueDepthBaseline) { mHasReadyQueueDepthBaseline = true; @@ -532,14 +721,18 @@ void VideoBackend::RecordReadyQueueDepthSample(const RenderOutputQueueMetrics& m ++mReadyQueueZeroDepthCount; } -void VideoBackend::RecordOutputRenderDuration(double renderMilliseconds) +void VideoBackend::RecordOutputRenderDuration(double renderMilliseconds, double acquireMilliseconds, double renderRequestMilliseconds, double endAccessMilliseconds) { + std::lock_guard lock(mOutputMetricsMutex); mOutputRenderMilliseconds = (std::max)(renderMilliseconds, 0.0); if (mSmoothedOutputRenderMilliseconds <= 0.0) mSmoothedOutputRenderMilliseconds = mOutputRenderMilliseconds; else mSmoothedOutputRenderMilliseconds = mSmoothedOutputRenderMilliseconds * 0.9 + mOutputRenderMilliseconds * 0.1; mMaxOutputRenderMilliseconds = (std::max)(mMaxOutputRenderMilliseconds, mOutputRenderMilliseconds); + mOutputFrameAcquireMilliseconds = (std::max)(acquireMilliseconds, 0.0); + mOutputFrameRenderRequestMilliseconds = (std::max)(renderRequestMilliseconds, 0.0); + mOutputFrameEndAccessMilliseconds = (std::max)(endAccessMilliseconds, 0.0); PublishTimingSample("VideoBackend", "outputRender", mOutputRenderMilliseconds, "ms"); PublishTimingSample("VideoBackend", "smoothedOutputRender", mSmoothedOutputRenderMilliseconds, "ms"); diff --git a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.h b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.h index 708daf8..cf550b2 100644 --- a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.h +++ b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoBackend.h @@ -1,5 +1,6 @@ #pragma once +#include "OutputProductionController.h" #include "RenderOutputQueue.h" #include "VideoBackendLifecycle.h" #include "VideoIOTypes.h" @@ -59,6 +60,7 @@ public: bool KeyerInterfaceAvailable() const; bool ExternalKeyingActive() const; const std::string& StatusMessage() const; + bool ShouldPrioritizeOutputOverPreview() const; void SetStatusMessage(const std::string& message); void PublishStatus(bool externalKeyingConfigured, const std::string& statusMessage = std::string()); void ReportNoInputDeviceSignalStatus(); @@ -69,14 +71,20 @@ private: void StartOutputCompletionWorker(); void StopOutputCompletionWorker(); void OutputCompletionWorkerMain(); + void StartOutputProducerWorker(); + void StopOutputProducerWorker(); + void OutputProducerWorkerMain(); + void NotifyOutputProducer(); + std::chrono::milliseconds OutputProducerWakeInterval() const; void ProcessOutputFrameCompletion(const VideoIOCompletion& completion); - bool FillReadyOutputQueue(const VideoIOCompletion& completion); + std::size_t ProduceReadyOutputFrames(const VideoIOCompletion& completion, std::size_t maxFrames); + OutputProductionPressure BuildOutputProductionPressure(const RenderOutputQueueMetrics& metrics) const; bool RenderReadyOutputFrame(const VideoIOState& state, const VideoIOCompletion& completion); bool ScheduleReadyOutputFrame(); bool ScheduleBlackUnderrunFrame(); void RecordFramePacing(VideoIOCompletionResult completionResult); void RecordReadyQueueDepthSample(const RenderOutputQueueMetrics& metrics); - void RecordOutputRenderDuration(double renderMilliseconds); + void RecordOutputRenderDuration(double renderMilliseconds, double acquireMilliseconds, double renderRequestMilliseconds, double endAccessMilliseconds); bool ApplyLifecycleTransition(VideoBackendLifecycleState state, const std::string& message); bool ApplyLifecycleFailure(const std::string& message); void PublishBackendStateChanged(const std::string& state, const std::string& message); @@ -92,6 +100,7 @@ private: RuntimeEventDispatcher& mRuntimeEventDispatcher; VideoBackendLifecycle mLifecycle; VideoPlayoutPolicy mPlayoutPolicy; + OutputProductionController mOutputProductionController; RenderOutputQueue mReadyOutputQueue; std::unique_ptr mVideoIODevice; std::unique_ptr mBridge; @@ -99,8 +108,17 @@ private: std::condition_variable mOutputCompletionCondition; std::deque mPendingOutputCompletions; std::thread mOutputCompletionWorker; + std::mutex mOutputProducerMutex; + std::condition_variable mOutputProducerCondition; + std::thread mOutputProducerWorker; + VideoIOCompletion mLastOutputProductionCompletion; + std::chrono::steady_clock::time_point mLastOutputProductionTime; + std::mutex mOutputProductionMutex; + mutable std::mutex mOutputMetricsMutex; bool mOutputCompletionWorkerRunning = false; bool mOutputCompletionWorkerStopping = false; + bool mOutputProducerWorkerRunning = false; + bool mOutputProducerWorkerStopping = false; uint64_t mNextReadyOutputFrameIndex = 0; uint64_t mInputFrameIndex = 0; uint64_t mOutputFrameScheduleIndex = 0; @@ -121,6 +139,11 @@ private: double mOutputRenderMilliseconds = 0.0; double mSmoothedOutputRenderMilliseconds = 0.0; double mMaxOutputRenderMilliseconds = 0.0; + double mOutputFrameAcquireMilliseconds = 0.0; + double mOutputFrameRenderRequestMilliseconds = 0.0; + double mOutputFrameEndAccessMilliseconds = 0.0; + uint64_t mLastLateStreak = 0; + uint64_t mLastDropStreak = 0; uint64_t mLateFrameCount = 0; uint64_t mDroppedFrameCount = 0; uint64_t mFlushedFrameCount = 0; diff --git a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoPlayoutPolicy.h b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoPlayoutPolicy.h index 47db122..ce869cf 100644 --- a/apps/LoopThroughWithOpenGLCompositing/videoio/VideoPlayoutPolicy.h +++ b/apps/LoopThroughWithOpenGLCompositing/videoio/VideoPlayoutPolicy.h @@ -30,5 +30,8 @@ inline VideoPlayoutPolicy NormalizeVideoPlayoutPolicy(VideoPlayoutPolicy policy) policy.targetReadyFrames = 1; if (policy.maxReadyFrames < policy.targetReadyFrames) policy.maxReadyFrames = policy.targetReadyFrames; + const unsigned minimumOutputFramePoolSize = policy.targetPrerollFrames + policy.maxReadyFrames + policy.minimumSpareDeviceFrames; + if (policy.outputFramePoolSize < minimumOutputFramePoolSize) + policy.outputFramePoolSize = minimumOutputFramePoolSize; return policy; } diff --git a/apps/LoopThroughWithOpenGLCompositing/videoio/decklink/DeckLinkSession.cpp b/apps/LoopThroughWithOpenGLCompositing/videoio/decklink/DeckLinkSession.cpp index 2b31217..28ded1b 100644 --- a/apps/LoopThroughWithOpenGLCompositing/videoio/decklink/DeckLinkSession.cpp +++ b/apps/LoopThroughWithOpenGLCompositing/videoio/decklink/DeckLinkSession.cpp @@ -423,7 +423,6 @@ bool DeckLinkSession::AcquireNextOutputVideoFrame(CComPtrAddRef(); frame.nativeFrame = outputVideoFrame; frame.nativeBuffer = outputVideoFrameBuffer.Detach(); return true; @@ -506,7 +506,10 @@ VideoPlayoutRecoveryDecision DeckLinkSession::AccountForCompletionResult(VideoIO bool DeckLinkSession::ScheduleOutputFrame(const VideoIOOutputFrame& frame) { IDeckLinkMutableVideoFrame* outputVideoFrame = static_cast(frame.nativeFrame); - return ScheduleFrame(outputVideoFrame); + const bool scheduled = ScheduleFrame(outputVideoFrame); + if (outputVideoFrame != nullptr) + outputVideoFrame->Release(); + return scheduled; } bool DeckLinkSession::Start() @@ -616,8 +619,18 @@ void DeckLinkSession::HandleVideoInputFrame(IDeckLinkVideoInputFrame* inputFrame inputFrameBuffer->EndAccess(bmdBufferAccessRead); } -void DeckLinkSession::HandlePlayoutFrameCompleted(IDeckLinkVideoFrame*, BMDOutputFrameCompletionResult completionResult) +void DeckLinkSession::HandlePlayoutFrameCompleted(IDeckLinkVideoFrame* completedFrame, BMDOutputFrameCompletionResult completionResult) { + if (completedFrame != nullptr) + { + CComPtr reusableFrame; + if (completedFrame->QueryInterface(IID_IDeckLinkMutableVideoFrame, reinterpret_cast(&reusableFrame)) == S_OK && + reusableFrame != nullptr) + { + outputVideoFrameQueue.push_back(reusableFrame); + } + } + if (!mOutputFrameCallback) return; diff --git a/docs/PHASE_7_5_PROACTIVE_PLAYOUT_TIMING_DESIGN.md b/docs/PHASE_7_5_PROACTIVE_PLAYOUT_TIMING_DESIGN.md index 6a21c7e..58dc662 100644 --- a/docs/PHASE_7_5_PROACTIVE_PLAYOUT_TIMING_DESIGN.md +++ b/docs/PHASE_7_5_PROACTIVE_PLAYOUT_TIMING_DESIGN.md @@ -7,7 +7,7 @@ Phase 7 made backend lifecycle, playout policy, ready-frame queueing, late/drop ## Status - Phase 7.5 design package: proposed. -- Phase 7.5 implementation: Step 2 in progress. +- Phase 7.5 implementation: Step 5 in progress. - Current alignment: Phase 7 is complete. `RenderOutputQueue`, `VideoPlayoutPolicy`, `VideoPlayoutScheduler`, `VideoBackendLifecycle`, and backend playout telemetry exist. The backend worker fills the ready queue on completion demand, but render production is not yet proactively driven by queue pressure or video cadence. Current footholds: @@ -19,6 +19,9 @@ Current footholds: - `HealthTelemetry::BackendPlayoutSnapshot` exposes queue depth, underruns, late/drop streaks, and recovery decisions. - Step 1 adds baseline timing fields for ready-queue min/max/zero-depth samples and output render duration. - Step 2 adds a pure `OutputProductionController` for queue-pressure production decisions. +- Step 3 adds a proactive output producer worker that keeps `RenderOutputQueue` warm after playback starts. +- Step 4 skips non-forced preview presentation while output ready-queue depth is below target. +- Step 5 makes async readback misses prefer cached output over synchronous readback after bootstrap. ## Timing Review Findings @@ -199,15 +202,23 @@ Move from demand-filled output production to queue-pressure production. Initial target: -- producer wakes when queue depth is below target -- producer requests render-thread output production until target depth is reached -- producer stops when backend stops or render thread shuts down -- completion worker mostly schedules from already-ready frames +- [x] producer wakes when queue depth is below target +- [x] producer requests render-thread output production until target depth is reached +- [x] producer stops when backend stops or render thread shuts down +- [x] completion worker mostly schedules from already-ready frames Exit criteria: -- normal playback does not depend on completion processing to fill the queue from empty -- callback/completion pressure and render production pressure are separate +- [x] normal playback does not depend on completion processing to fill the queue from empty +- [x] callback/completion pressure and render production pressure are separate + +Implementation notes: + +- `VideoBackend` starts the completion worker before device start, then starts the output producer only after DeckLink start succeeds. This avoids fighting DeckLink preroll for the same output frame pool. +- `OutputProducerWorkerMain()` periodically wakes and uses `OutputProductionController` to decide whether to produce, wait, or throttle. +- Completion handling records pacing/recovery, updates producer pressure, schedules a ready frame, and wakes the producer to refill headroom. +- Completion handling keeps a one-frame synchronous fallback when the ready queue is unexpectedly empty, then falls back to black underrun behavior if that also fails. +- Producer shutdown is explicit and joined before video output teardown. ### Step 4. Prioritize Playout Over Preview @@ -215,15 +226,21 @@ Make preview explicitly subordinate to output playout deadlines. Initial target: -- skip or delay preview when ready queue depth is below target +- [x] skip or delay preview when ready queue depth is below target - count skipped previews - record preview present cost separately from output render cost Exit criteria: -- preview cannot drain output headroom invisibly +- [x] preview cannot drain output headroom invisibly - runtime telemetry shows preview skips and preview present cost +Implementation notes: + +- `OpenGLComposite::paintGL(false)` now skips preview presentation when `VideoBackend` reports that the ready queue is below the target depth. +- Forced preview paints are still allowed so resize/manual paint behavior remains intact. +- Preview skip counters and present-cost telemetry remain follow-up work for this step. + ### Step 5. Make Readback Miss Policy Deadline-Aware Avoid turning a late async readback fence into synchronous deadline pressure by default. @@ -232,13 +249,20 @@ Initial target: - count async readback misses - count synchronous fallback uses -- allow policy to prefer stale/black output over synchronous fallback when queue pressure is high -- keep current fallback available while behavior is measured +- [x] allow policy to prefer stale/black output over synchronous fallback when queue pressure is high +- [x] keep current fallback available while behavior is measured Exit criteria: -- readback fallback is an explicit policy decision -- late GPU fences do not automatically block the most timing-sensitive path +- [x] readback fallback is an explicit policy decision +- [x] late GPU fences do not automatically block the most timing-sensitive path + +Implementation notes: + +- `OpenGLRenderPipeline::ReadOutputFrame()` now uses synchronous readback only to bootstrap the first cached output frame. +- After cached output exists, an async readback miss copies the cached output frame into the DeckLink output frame instead of blocking on synchronous `glReadPixels`. +- Async readback queueing now skips when the next PBO slot is still in flight rather than deleting an in-flight fence and overwriting it. +- Miss/fallback counters remain follow-up telemetry work for this step. ### Step 6. Tune Headroom Policy diff --git a/docs/PHASE_7_5_READBACK_EXPERIMENT_LOG.md b/docs/PHASE_7_5_READBACK_EXPERIMENT_LOG.md new file mode 100644 index 0000000..a4720ee --- /dev/null +++ b/docs/PHASE_7_5_READBACK_EXPERIMENT_LOG.md @@ -0,0 +1,165 @@ +# Phase 7.5 Readback Experiment Log + +This log tracks short readback experiments during the proactive playout timing work. + +## How To Run + +The default debugger launch keeps the current production path: + +- `Debug LoopThroughWithOpenGLCompositing` +- `VST_OUTPUT_READBACK_MODE` unset +- mode: `async_pbo` + +Comparison modes are still available: + +- `VST_OUTPUT_READBACK_MODE=async_pbo` +- uses the older PBO/fence readback path + +The experiment launches are: + +- `Debug LoopThroughWithOpenGLCompositing - sync readback experiment` +- `VST_OUTPUT_READBACK_MODE=sync` +- uses direct synchronous `glReadPixels()` every output frame + +- `Debug LoopThroughWithOpenGLCompositing - cached output experiment` +- `VST_OUTPUT_READBACK_MODE=cached_only` +- uses one bootstrap synchronous readback, then copies the cached output frame without ongoing GPU readback + +The cached-output experiment is not visually correct for live motion. It exists to test whether removing ongoing GPU readback lets the producer fill the ready queue again. + +## Experiment 3: fast_transfer + +Status: removed from active code after hardware sample + +Date: 2026-05-11 + +Change: + +- DeckLink output frames are now created with `CreateVideoFrameWithBuffer()`. +- Output frame buffers are owned by `PinnedMemoryAllocator`. +- `VideoIOOutputFrame` carries a texture-transfer callback. +- The test branch changed the default render readback path to try `VideoFrameTransfer::GPUtoCPU` against the output texture for BGRA output. +- If fast transfer is unavailable or fails, the code falls back to cached output if present, then synchronous readback as a safety fallback. + +Question: + +Can SDK-style pinned/DVP transfer recover real rendered output timing without the visually-invalid cached-only shortcut? + +Result: + +- The test machine reported `GL_VENDOR=NVIDIA Corporation` and `GL_RENDERER=NVIDIA GeForce RTX 4060 Ti/PCIe/SSE2`. +- The DeckLink SDK OpenGL fast-transfer sample gates NVIDIA DVP on `GL_RENDERER` containing `Quadro`. +- `GL_AMD_pinned_memory` was also unavailable. +- The fast-transfer path was removed from active code to avoid carrying unsupported DVP dependencies while we investigate CPU-frame buffering and render-ahead. + +## Baseline: async_pbo + +Date: 2026-05-11 + +Observed while the app was running after adding the async queue split counters. + +Summary: + +- ready queue was pinned at 0 or briefly 1 +- underrun, zero-depth, late, and dropped counts increased continuously +- `renderRequestMs` usually sat around 16-25 ms, with occasional larger spikes +- `asyncQueueMs` was mostly explained by `asyncQueueReadPixelsMs` +- PBO allocation/orphaning was effectively 0 ms + +Representative samples: + +| readyDepth | renderRequestMs | queueWaitMs | drawMs | mapMs | copyMs | asyncQueueMs | asyncQueueBufferMs | asyncQueueReadPixelsMs | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| 0 | 24.915 | 3.018 | 0.510 | 0.923 | 0.768 | 9.018 | 0.000 | 9.001 | +| 0 | 16.226 | 3.066 | 0.518 | 1.202 | 0.812 | 8.611 | 0.000 | 8.598 | +| 0 | 12.134 | 3.796 | 3.579 | 1.378 | 0.690 | 10.323 | 0.000 | 10.311 | +| 0 | 17.496 | 2.817 | 0.523 | 1.267 | 1.160 | 9.416 | 0.000 | 9.403 | + +Initial read: + +The main repeated cost is issuing `glReadPixels(..., nullptr)` into the PBO. `glBufferData`, setup, fence creation, fence wait, map, and CPU copy are not large enough to explain the underruns. + +## Experiment 1: sync + +Status: sampled + +Question: + +Does the direct synchronous readback path perform better or worse than the current PBO path on this machine and DeckLink format? + +Expected interpretation: + +- If `syncReadMs` is lower than `asyncQueueReadPixelsMs` and the ready queue improves, the current PBO path is the wrong strategy for this driver/format. +- If `syncReadMs` is also high and the ready queue remains empty, any GPU-to-CPU readback in this path is too expensive for the current producer cadence. + +Results: + +Date: 2026-05-11 + +Summary: + +- ready queue remained pinned at 0 +- underrun, zero-depth, late, and dropped counts continued increasing +- `asyncQueueMs` and async readback counters were 0, confirming the experiment mode was active +- direct `syncReadMs` was generally worse than the baseline PBO `asyncQueueReadPixelsMs` + +Representative samples: + +| readyDepth | renderRequestMs | queueWaitMs | drawMs | syncReadMs | asyncQueueMs | syncFallbackCount | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| 0 | 32.467 | 5.764 | 1.389 | 23.122 | 0.000 | 680 | +| 0 | 29.722 | 2.603 | 0.512 | 25.538 | 0.000 | 697 | +| 0 | 37.844 | 7.716 | 0.518 | 23.608 | 0.000 | 706 | +| 0 | 22.304 | 3.089 | 1.843 | 15.278 | 0.000 | 723 | +| 0 | 27.196 | 4.015 | 0.500 | 21.933 | 0.000 | 736 | + +Read: + +Direct synchronous readback does not recover the queue and is slower than the async PBO path on the sampled run. The bottleneck appears to be GPU-to-CPU readback itself, not PBO orphaning or fence handling. + +## Experiment 2: cached_only + +Status: sampled + +Question: + +If ongoing GPU readback is removed after bootstrap, can the producer keep the ready queue above 0? + +Expected interpretation: + +- If ready depth rises and underruns slow or stop, readback is the primary bottleneck. +- If ready depth still stays near 0, the bottleneck is elsewhere in scheduling, frame acquisition, queueing, or DeckLink handoff. + +Results: + +Date: 2026-05-11 + +User-visible result: + +- DeckLink reported a healthy 5-frame buffer. + +Telemetry summary: + +- `renderRequestMs` dropped to roughly 1-3 ms. +- `cachedCopyMs` was usually around 0.8-1.0 ms, with one sampled low value around 0.37 ms. +- `asyncQueueMs`, `asyncQueueReadPixelsMs`, `syncReadMs`, fence wait, map, and async copy were 0 after bootstrap. +- `syncFallbackCount` stayed at 1, confirming one bootstrap readback. +- `cachedFallbackCount` increased continuously, confirming ongoing frames were served from cached CPU memory. +- late and dropped counts were 0 during the sampled run. +- internal ready queue depth still reported mostly 0-1 even while DeckLink showed a healthy hardware/device buffer. + +Representative samples: + +| readyDepth | renderRequestMs | queueWaitMs | drawMs | cachedCopyMs | asyncQueueMs | syncReadMs | late | dropped | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| 0 | 1.446 | 0.018 | 0.518 | 0.864 | 0.000 | 0.000 | 0 | 0 | +| 0 | 2.586 | 1.089 | 0.514 | 0.829 | 0.000 | 0.000 | 0 | 0 | +| 0 | 1.481 | 2.378 | 0.502 | 0.911 | 0.000 | 0.000 | 0 | 0 | +| 0 | 0.892 | 0.013 | 0.468 | 0.371 | 0.000 | 0.000 | 0 | 0 | +| 1 | 1.398 | 0.019 | 0.483 | 0.819 | 0.000 | 0.000 | 0 | 0 | + +Read: + +Removing ongoing GPU readback recovers output timing immediately. The direct cause of the Phase 7.5 playback collapse is the per-frame GPU-to-CPU readback cost, not DeckLink frame acquisition, output frame end-access, PBO allocation, fence waiting, or CPU copy. + +The internal ready queue depth still being low while DeckLink reports a healthy device buffer suggests the ready queue is acting as a short staging queue rather than the full device playout buffer. For the next fix, prioritize avoiding a blocking readback on every output frame instead of only increasing internal ready queue depth. diff --git a/docs/openapi.yaml b/docs/openapi.yaml index a067507..7e7858a 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -363,6 +363,10 @@ components: $ref: "#/components/schemas/VideoIOStatus" performance: $ref: "#/components/schemas/PerformanceStatus" + backendPlayout: + $ref: "#/components/schemas/BackendPlayoutStatus" + runtimeEvents: + $ref: "#/components/schemas/RuntimeEventStatus" shaders: type: array items: @@ -382,10 +386,16 @@ components: type: number oscPort: type: number + oscBindAddress: + type: string + oscSmoothing: + type: number autoReload: type: boolean maxTemporalHistoryFrames: type: number + previewFps: + type: number enableExternalKeying: type: boolean inputVideoFormat: @@ -478,6 +488,175 @@ components: type: number flushedFrameCount: type: number + BackendPlayoutStatus: + type: object + properties: + lifecycleState: + type: string + example: running + degraded: + type: boolean + statusMessage: + type: string + lateFrameCount: + type: number + droppedFrameCount: + type: number + flushedFrameCount: + type: number + readyQueue: + $ref: "#/components/schemas/BackendReadyQueueStatus" + outputRender: + $ref: "#/components/schemas/BackendOutputRenderStatus" + recovery: + $ref: "#/components/schemas/BackendPlayoutRecoveryStatus" + BackendReadyQueueStatus: + type: object + properties: + depth: + type: number + description: Current number of ready output frames. + capacity: + type: number + description: Maximum ready output frames currently allowed. + minDepth: + type: number + description: Minimum observed ready queue depth since backend worker start. + maxDepth: + type: number + description: Maximum observed ready queue depth since backend worker start. + zeroDepthCount: + type: number + description: Number of observed samples where the ready queue was empty. + pushedCount: + type: number + poppedCount: + type: number + droppedCount: + type: number + underrunCount: + type: number + BackendOutputRenderStatus: + type: object + properties: + renderMs: + type: number + description: Most recent output render duration in milliseconds. + smoothedRenderMs: + type: number + description: Smoothed output render duration in milliseconds. + maxRenderMs: + type: number + description: Maximum observed output render duration in milliseconds. + acquireFrameMs: + type: number + description: Time spent acquiring a writable backend output frame in milliseconds. + renderRequestMs: + type: number + description: Time spent executing the render-thread output frame request in milliseconds. + endAccessMs: + type: number + description: Time spent ending write access to the backend output frame in milliseconds. + queueWaitMs: + type: number + description: Time the output render request spent waiting for the render thread in milliseconds. + drawMs: + type: number + description: Time spent drawing, blitting, packing, and flushing the output frame in milliseconds. + fenceWaitMs: + type: number + description: Time spent waiting for the async readback fence in milliseconds. + mapMs: + type: number + description: Time spent mapping the async readback pixel buffer in milliseconds. + readbackCopyMs: + type: number + description: Time spent copying async readback bytes into the backend output frame in milliseconds. + cachedCopyMs: + type: number + description: Time spent copying the cached output frame when async readback is not ready in milliseconds. + asyncQueueMs: + type: number + description: Time spent queueing the next async readback in milliseconds. + asyncQueueBufferMs: + type: number + description: Time spent orphaning or allocating the async readback pixel buffer in milliseconds. + asyncQueueSetupMs: + type: number + description: Time spent applying readback pixel-store, framebuffer, and pixel-pack-buffer state in milliseconds. + asyncQueueReadPixelsMs: + type: number + description: Time spent issuing glReadPixels for the async readback in milliseconds. + asyncQueueFenceMs: + type: number + description: Time spent creating the async readback fence in milliseconds. + syncReadMs: + type: number + description: Time spent in bootstrap synchronous readback in milliseconds. + asyncReadbackMissCount: + type: number + description: Count of output render requests where async readback was not ready. + cachedFallbackCount: + type: number + description: Count of output render requests served from the cached output frame. + syncFallbackCount: + type: number + description: Count of output render requests that used bootstrap synchronous readback. + BackendPlayoutRecoveryStatus: + type: object + properties: + completionResult: + type: string + enum: [Completed, DisplayedLate, Dropped, Flushed, Unknown] + completedFrameIndex: + type: number + scheduledFrameIndex: + type: number + scheduledLeadFrames: + type: number + measuredLagFrames: + type: number + catchUpFrames: + type: number + lateStreak: + type: number + dropStreak: + type: number + RuntimeEventStatus: + type: object + properties: + queue: + $ref: "#/components/schemas/RuntimeEventQueueStatus" + dispatch: + $ref: "#/components/schemas/RuntimeEventDispatchStatus" + RuntimeEventQueueStatus: + type: object + properties: + name: + type: string + depth: + type: number + capacity: + type: number + droppedCount: + type: number + oldestEventAgeMs: + type: number + RuntimeEventDispatchStatus: + type: object + properties: + dispatchCallCount: + type: number + dispatchedEventCount: + type: number + handlerInvocationCount: + type: number + handlerFailureCount: + type: number + lastDispatchDurationMs: + type: number + maxDispatchDurationMs: + type: number ShaderSummary: type: object properties: @@ -497,6 +676,8 @@ components: description: Error text for unavailable shader packages. temporal: $ref: "#/components/schemas/TemporalState" + feedback: + $ref: "#/components/schemas/FeedbackState" TemporalState: type: object properties: @@ -509,6 +690,13 @@ components: type: number effectiveHistoryLength: type: number + FeedbackState: + type: object + properties: + enabled: + type: boolean + writePass: + type: string LayerState: type: object properties: diff --git a/tests/HealthTelemetryTests.cpp b/tests/HealthTelemetryTests.cpp index 989a4da..7bc1285 100644 --- a/tests/HealthTelemetryTests.cpp +++ b/tests/HealthTelemetryTests.cpp @@ -95,6 +95,9 @@ void TestBackendPlayoutHealth() 8.5, 7.25, 12.0, + 1.0, + 6.5, + 0.5, 8, 11, 3, @@ -121,6 +124,9 @@ void TestBackendPlayoutHealth() Expect(playout.outputRenderMilliseconds == 8.5, "backend playout health stores output render duration"); Expect(playout.smoothedOutputRenderMilliseconds == 7.25, "backend playout health stores smoothed output render duration"); Expect(playout.maxOutputRenderMilliseconds == 12.0, "backend playout health stores max output render duration"); + Expect(playout.outputFrameAcquireMilliseconds == 1.0, "backend playout health stores output frame acquire duration"); + Expect(playout.outputFrameRenderRequestMilliseconds == 6.5, "backend playout health stores output render request duration"); + Expect(playout.outputFrameEndAccessMilliseconds == 0.5, "backend playout health stores output frame end access duration"); Expect(playout.completedFrameIndex == 8, "backend playout health stores completed index"); Expect(playout.scheduledFrameIndex == 11, "backend playout health stores scheduled index"); Expect(playout.measuredLagFrames == 2, "backend playout health stores measured lag"); @@ -148,6 +154,9 @@ void TestBackendPlayoutHealth() -5.0, -4.0, -3.0, + -2.0, + -1.0, + -0.5, 9, 12, 3, @@ -165,6 +174,32 @@ void TestBackendPlayoutHealth() Expect(snapshot.backendPlayout.lifecycleState == "Running", "full health snapshot includes backend playout state"); Expect(!snapshot.backendPlayout.degraded, "full health snapshot includes backend degraded state"); } + +void TestOutputRenderPipelineTiming() +{ + HealthTelemetry telemetry; + telemetry.RecordOutputRenderQueueWait(2.5); + telemetry.RecordOutputRenderPipelineTiming(1.0, 0.5, 0.25, 0.75, 0.125, 0.375, 0.1, 0.2, 0.3, 0.4, 3.5, true, true, false); + Expect(telemetry.TryRecordOutputRenderPipelineTiming(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, false, false, true), + "try output render timing succeeds when uncontended"); + + const HealthTelemetry::BackendPlayoutSnapshot playout = telemetry.GetBackendPlayoutSnapshot(); + Expect(playout.outputRenderQueueWaitMilliseconds == 2.5, "output render timing stores queue wait"); + Expect(playout.outputRenderDrawMilliseconds == 0.0, "output render timing clamps draw duration"); + Expect(playout.outputReadbackFenceWaitMilliseconds == 0.0, "output render timing clamps fence wait duration"); + Expect(playout.outputReadbackMapMilliseconds == 0.0, "output render timing clamps map duration"); + Expect(playout.outputReadbackCopyMilliseconds == 0.0, "output render timing clamps readback copy duration"); + Expect(playout.outputCachedCopyMilliseconds == 0.0, "output render timing clamps cached copy duration"); + Expect(playout.outputAsyncQueueMilliseconds == 0.0, "output render timing clamps async queue duration"); + Expect(playout.outputAsyncQueueBufferMilliseconds == 0.0, "output render timing clamps async queue buffer duration"); + Expect(playout.outputAsyncQueueSetupMilliseconds == 0.0, "output render timing clamps async queue setup duration"); + Expect(playout.outputAsyncQueueReadPixelsMilliseconds == 0.0, "output render timing clamps async queue read pixels duration"); + Expect(playout.outputAsyncQueueFenceMilliseconds == 0.0, "output render timing clamps async queue fence duration"); + Expect(playout.outputSyncReadMilliseconds == 0.0, "output render timing clamps sync read duration"); + Expect(playout.outputAsyncReadbackMissCount == 1, "output render timing counts async readback misses"); + Expect(playout.outputCachedFallbackCount == 1, "output render timing counts cached fallbacks"); + Expect(playout.outputSyncFallbackCount == 1, "output render timing counts sync fallbacks"); +} } int main() @@ -174,6 +209,7 @@ int main() TestRuntimeEventTryRecord(); TestPersistenceWriteHealth(); TestBackendPlayoutHealth(); + TestOutputRenderPipelineTiming(); if (gFailures != 0) { diff --git a/tests/VideoPlayoutSchedulerTests.cpp b/tests/VideoPlayoutSchedulerTests.cpp index 957e21e..231c0d4 100644 --- a/tests/VideoPlayoutSchedulerTests.cpp +++ b/tests/VideoPlayoutSchedulerTests.cpp @@ -93,9 +93,10 @@ void TestPolicyNormalization() policy.maxReadyFrames = 2; VideoPlayoutPolicy normalized = NormalizeVideoPlayoutPolicy(policy); - Expect(normalized.outputFramePoolSize == 1, "policy normalization keeps at least one output frame"); Expect(normalized.targetPrerollFrames == 1, "policy normalization keeps at least one preroll frame"); Expect(normalized.maxReadyFrames == normalized.targetReadyFrames, "policy normalization keeps max ready frames above target"); + Expect(normalized.outputFramePoolSize >= normalized.targetPrerollFrames + normalized.maxReadyFrames + normalized.minimumSpareDeviceFrames, + "policy normalization keeps enough output frames for preroll and ready queue ownership"); } void TestFrameBudgets()