#include "OpenGLRenderPipeline.h"

#include "HealthTelemetry.h"
#include "OpenGLRenderer.h"
#include "RuntimeSnapshotProvider.h"
#include "VideoIOFormat.h"

// NOTE(review): standard headers for the std names used in this file;
// confirm against the project's original include list.
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <string>

namespace
{
using SteadyClock = std::chrono::steady_clock;

// How long a consume attempt may block on the readback fence (0.5 ms).
constexpr GLuint64 kAsyncReadbackWaitNanoseconds = 500000;

// Elapsed time between two steady-clock samples, in fractional milliseconds.
double MillisecondsBetween(SteadyClock::time_point start, SteadyClock::time_point end)
{
	return std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
}

// True when the output goes through the 10-bit pack pass (V210 / Yuva10).
bool UsesPackedOutput(const VideoIOState& state)
{
	return state.outputPixelFormat == VideoIOPixelFormat::V210
		|| state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
}

// Byte size the frame declares (rowBytes * height), or 0 when the frame has no
// buffer or does not declare usable dimensions.
std::size_t OutputFrameByteCount(const VideoIOOutputFrame& outputFrame)
{
	if (outputFrame.bytes == nullptr || outputFrame.height == 0 || outputFrame.rowBytes <= 0)
		return 0;
	return static_cast<std::size_t>(outputFrame.rowBytes) * outputFrame.height;
}
}

/// Stores the collaborators and callbacks and picks the readback mode from the
/// VST_OUTPUT_READBACK_MODE environment variable.
OpenGLRenderPipeline::OpenGLRenderPipeline(
	OpenGLRenderer& renderer,
	RuntimeSnapshotProvider& runtimeSnapshotProvider,
	HealthTelemetry& healthTelemetry,
	RenderEffectCallback renderEffect,
	OutputReadyCallback outputReady,
	PaintCallback paint) :
	mRenderer(renderer),
	mRuntimeSnapshotProvider(runtimeSnapshotProvider),
	mHealthTelemetry(healthTelemetry),
	mRenderEffect(renderEffect),
	mOutputReady(outputReady),
	mPaint(paint),
	mOutputReadbackMode(ReadOutputReadbackModeFromEnvironment())
{
}

/// Releases fences and pixel-pack buffers. This issues GL calls, so it
/// presumably needs the owning GL context to be current - TODO confirm.
OpenGLRenderPipeline::~OpenGLRenderPipeline()
{
	ResetAsyncReadbackState();
}

/// Renders one frame into the composite framebuffer, blits it to the output
/// framebuffer, optionally packs it for 10-bit output, then reads the result
/// back into `outputFrame` and records timing telemetry.
/// @return Always true.
bool OpenGLRenderPipeline::RenderFrame(const RenderPipelineFrameContext& context, VideoIOOutputFrame& outputFrame)
{
	const VideoIOState& state = context.videoState;

	const auto renderStartTime = SteadyClock::now();

	glBindFramebuffer(GL_FRAMEBUFFER, mRenderer.CompositeFramebuffer());
	mRenderEffect();

	// Scale the composite (input-sized) image into the output-sized framebuffer.
	glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.CompositeFramebuffer());
	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, mRenderer.OutputFramebuffer());
	glBlitFramebuffer(
		0, 0, state.inputFrameSize.width, state.inputFrameSize.height,
		0, 0, state.outputFrameSize.width, state.outputFrameSize.height,
		GL_COLOR_BUFFER_BIT, GL_LINEAR);
	glBindFramebuffer(GL_FRAMEBUFFER, mRenderer.OutputFramebuffer());

	if (mOutputReady)
		mOutputReady();

	if (UsesPackedOutput(state))
		PackOutputFor10Bit(state);

	glFlush();

	const auto renderEndTime = SteadyClock::now();
	const double renderMilliseconds = MillisecondsBetween(renderStartTime, renderEndTime);
	mHealthTelemetry.TryRecordPerformanceStats(state.frameBudgetMilliseconds, renderMilliseconds);
	mRuntimeSnapshotProvider.AdvanceFrame();

	OutputReadbackTiming readbackTiming = ReadOutputFrame(state, outputFrame);
	mHealthTelemetry.TryRecordOutputRenderPipelineTiming(
		renderMilliseconds,
		readbackTiming.fenceWaitMilliseconds,
		readbackTiming.mapMilliseconds,
		readbackTiming.copyMilliseconds,
		readbackTiming.cachedCopyMilliseconds,
		readbackTiming.asyncQueueMilliseconds,
		readbackTiming.asyncQueueBufferMilliseconds,
		readbackTiming.asyncQueueSetupMilliseconds,
		readbackTiming.asyncQueueReadPixelsMilliseconds,
		readbackTiming.asyncQueueFenceMilliseconds,
		readbackTiming.syncReadMilliseconds,
		readbackTiming.asyncReadbackMissed,
		readbackTiming.cachedFallbackUsed,
		readbackTiming.syncFallbackUsed);

	return true;
}

/// Runs the output-pack shader over the output texture into the pack
/// framebuffer. The pack-format uniform is 2 for Yuva10 and 1 otherwise.
void OpenGLRenderPipeline::PackOutputFor10Bit(const VideoIOState& state)
{
	glBindFramebuffer(GL_FRAMEBUFFER, mRenderer.OutputPackFramebuffer());
	glViewport(0, 0, state.outputPackTextureWidth, state.outputFrameSize.height);
	glDisable(GL_SCISSOR_TEST);
	glDisable(GL_BLEND);
	glDisable(GL_DEPTH_TEST);

	glActiveTexture(GL_TEXTURE0);
	glBindTexture(GL_TEXTURE_2D, mRenderer.OutputTexture());
	glBindVertexArray(mRenderer.FullscreenVertexArray());
	glUseProgram(mRenderer.OutputPackProgram());

	// A negative location means the uniform is not present; skip it.
	const GLint outputResolutionLocation = mRenderer.OutputPackResolutionLocation();
	const GLint activeWordsLocation = mRenderer.OutputPackActiveWordsLocation();
	const GLint packFormatLocation = mRenderer.OutputPackFormatLocation();
	if (outputResolutionLocation >= 0)
		glUniform2f(
			outputResolutionLocation,
			static_cast<float>(state.outputFrameSize.width),
			static_cast<float>(state.outputFrameSize.height));
	if (activeWordsLocation >= 0)
		glUniform1f(activeWordsLocation, static_cast<float>(ActiveV210WordsForWidth(state.outputFrameSize.width)));
	if (packFormatLocation >= 0)
		glUniform1i(packFormatLocation, state.outputPixelFormat == VideoIOPixelFormat::Yuva10 ? 2 : 1);

	glDrawArrays(GL_TRIANGLES, 0, 3);

	glUseProgram(0);
	glBindVertexArray(0);
	glBindTexture(GL_TEXTURE_2D, 0);
}

/// Makes sure every readback slot owns a pixel-pack buffer of `requiredBytes`.
/// Existing buffers are kept when the size already matches; otherwise all
/// async state is torn down and the buffers are recreated.
/// @return false only when `requiredBytes` is zero.
bool OpenGLRenderPipeline::EnsureAsyncReadbackBuffers(std::size_t requiredBytes)
{
	if (requiredBytes == 0)
		return false;

	if (mAsyncReadbackBytes == requiredBytes && mAsyncReadbackSlots[0].pixelPackBuffer != 0)
		return true;

	ResetAsyncReadbackState();
	mAsyncReadbackBytes = requiredBytes;

	for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
	{
		glGenBuffers(1, &slot.pixelPackBuffer);
		glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
		glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
		slot.sizeBytes = requiredBytes;
		slot.inFlight = false;
	}
	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);

	mAsyncReadbackWriteIndex = 0;
	mAsyncReadbackReadIndex = 0;
	return true;
}

/// Drops all pending readbacks and deletes every slot's pixel-pack buffer.
void OpenGLRenderPipeline::ResetAsyncReadbackState()
{
	FlushAsyncReadbackPipeline();

	// Each slot is checked on its own so a buffer is never leaked just because
	// the first slot happens to be empty.
	for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
	{
		slot.sizeBytes = 0;
		if (slot.pixelPackBuffer != 0)
		{
			glDeleteBuffers(1, &slot.pixelPackBuffer);
			slot.pixelPackBuffer = 0;
		}
	}

	mAsyncReadbackWriteIndex = 0;
	mAsyncReadbackReadIndex = 0;
	mAsyncReadbackBytes = 0;
}

/// Abandons every pending readback: deletes the fences, clears the in-flight
/// flags and rewinds both ring indices. The buffers themselves are kept.
void OpenGLRenderPipeline::FlushAsyncReadbackPipeline()
{
	for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
	{
		if (slot.fence != nullptr)
		{
			glDeleteSync(slot.fence);
			slot.fence = nullptr;
		}
		slot.inFlight = false;
	}

	mAsyncReadbackWriteIndex = 0;
	mAsyncReadbackReadIndex = 0;
}

/// Starts a readback of the current output into the slot at the write index
/// and fences it. Per-stage durations are accumulated into `timing`.
/// @return true when a fenced readback is now in flight; false when there is
///         nothing to read, the slot is still busy, or no fence was created.
bool OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing)
{
	const auto queueStartTime = SteadyClock::now();

	const bool usePackedOutput = UsesPackedOutput(state);
	const std::size_t requiredBytes = static_cast<std::size_t>(state.outputFrameRowBytes) * state.outputFrameSize.height;
	const GLenum format = usePackedOutput ? GL_RGBA : GL_BGRA;
	const GLenum type = usePackedOutput ? GL_UNSIGNED_BYTE : GL_UNSIGNED_INT_8_8_8_8_REV;
	const GLuint framebuffer = usePackedOutput ? mRenderer.OutputPackFramebuffer() : mRenderer.OutputFramebuffer();
	const GLsizei readWidth = static_cast<GLsizei>(usePackedOutput ? state.outputPackTextureWidth : state.outputFrameSize.width);
	const GLsizei readHeight = static_cast<GLsizei>(state.outputFrameSize.height);

	// Every exit path reports the total time spent in this call.
	const auto finishTiming = [&timing, queueStartTime]() {
		timing.asyncQueueMilliseconds += MillisecondsBetween(queueStartTime, SteadyClock::now());
	};

	if (requiredBytes == 0)
	{
		finishTiming();
		return false;
	}

	if (mAsyncReadbackBytes != requiredBytes
		|| mAsyncReadbackFormat != format
		|| mAsyncReadbackType != type
		|| mAsyncReadbackFramebuffer != framebuffer)
	{
		mAsyncReadbackFormat = format;
		mAsyncReadbackType = type;
		mAsyncReadbackFramebuffer = framebuffer;
		if (!EnsureAsyncReadbackBuffers(requiredBytes))
		{
			finishTiming();
			return false;
		}
	}

	AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackWriteIndex];
	if (slot.inFlight)
	{
		// Ring is full: the consumer has not drained this slot yet.
		finishTiming();
		return false;
	}

	auto stageStartTime = SteadyClock::now();
	glPixelStorei(GL_PACK_ALIGNMENT, 4);
	glPixelStorei(GL_PACK_ROW_LENGTH, 0);
	glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer);
	glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
	timing.asyncQueueSetupMilliseconds += MillisecondsBetween(stageStartTime, SteadyClock::now());

	// Re-specify the buffer storage before each read.
	stageStartTime = SteadyClock::now();
	glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
	timing.asyncQueueBufferMilliseconds += MillisecondsBetween(stageStartTime, SteadyClock::now());

	// With a pixel-pack buffer bound, the null pointer is an offset into it.
	stageStartTime = SteadyClock::now();
	glReadPixels(0, 0, readWidth, readHeight, format, type, nullptr);
	timing.asyncQueueReadPixelsMilliseconds += MillisecondsBetween(stageStartTime, SteadyClock::now());

	stageStartTime = SteadyClock::now();
	slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
	timing.asyncQueueFenceMilliseconds += MillisecondsBetween(stageStartTime, SteadyClock::now());

	slot.inFlight = slot.fence != nullptr;
	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);

	// Only move on when the readback can actually be tracked. Skipping over an
	// unfenced slot would leave a hole in the ring and let later frames be
	// consumed out of order.
	if (slot.inFlight)
		mAsyncReadbackWriteIndex = (mAsyncReadbackWriteIndex + 1) % mAsyncReadbackSlots.size();

	finishTiming();
	return slot.inFlight;
}

/// Tries to deliver the readback at the read index into `outputFrame`.
/// Waits up to `timeoutNanoseconds` on the slot's fence, maps the buffer,
/// copies it out, and caches the delivered frame.
/// @return true when `outputFrame` was filled from the async readback.
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing)
{
	if (mAsyncReadbackBytes == 0 || outputFrame.bytes == nullptr)
		return false;

	AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackReadIndex];
	if (!slot.inFlight || slot.fence == nullptr || slot.pixelPackBuffer == 0)
		return false;

	// A readback queued before an output size change can be larger than the
	// frame we are asked to fill; copying it would overrun the destination.
	// All slots share one size, so drop everything that is pending. Frames that
	// do not declare a size are left to the copy below, as before.
	const std::size_t frameBytes = OutputFrameByteCount(outputFrame);
	if (frameBytes != 0 && frameBytes < slot.sizeBytes)
	{
		FlushAsyncReadbackPipeline();
		timing.asyncReadbackMissed = true;
		return false;
	}

	// Marks the slot as free and steps the read index to the next slot.
	const auto retireSlot = [this, &slot]() {
		slot.inFlight = false;
		mAsyncReadbackReadIndex = (mAsyncReadbackReadIndex + 1) % mAsyncReadbackSlots.size();
	};

	const GLenum waitFlags = timeoutNanoseconds > 0 ? GL_SYNC_FLUSH_COMMANDS_BIT : 0;
	const auto waitStartTime = SteadyClock::now();
	const GLenum waitResult = glClientWaitSync(slot.fence, waitFlags, timeoutNanoseconds);
	timing.fenceWaitMilliseconds += MillisecondsBetween(waitStartTime, SteadyClock::now());

	if (waitResult == GL_WAIT_FAILED)
	{
		// The wait itself errored, so this fence will never report completion.
		// Retire the slot instead of leaving the ring stuck behind it.
		glDeleteSync(slot.fence);
		slot.fence = nullptr;
		retireSlot();
		timing.asyncReadbackMissed = true;
		return false;
	}
	if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED)
	{
		// Not finished within the timeout; leave the slot pending for a later call.
		timing.asyncReadbackMissed = true;
		return false;
	}

	glDeleteSync(slot.fence);
	slot.fence = nullptr;

	glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
	const auto mapStartTime = SteadyClock::now();
	void* mappedBytes = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
	timing.mapMilliseconds += MillisecondsBetween(mapStartTime, SteadyClock::now());

	if (mappedBytes == nullptr)
	{
		glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
		retireSlot();
		return false;
	}

	const auto copyStartTime = SteadyClock::now();
	std::memcpy(outputFrame.bytes, mappedBytes, slot.sizeBytes);
	timing.copyMilliseconds += MillisecondsBetween(copyStartTime, SteadyClock::now());

	glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
	retireSlot();

	CacheOutputFrame(outputFrame);
	return true;
}

/// Keeps a copy of `outputFrame` (rowBytes * height bytes) for later reuse.
/// Frames without a buffer or without valid dimensions are ignored.
void OpenGLRenderPipeline::CacheOutputFrame(const VideoIOOutputFrame& outputFrame)
{
	const std::size_t byteCount = OutputFrameByteCount(outputFrame);
	if (byteCount == 0)
		return;

	mCachedOutputFrame.resize(byteCount);
	std::memcpy(mCachedOutputFrame.data(), outputFrame.bytes, byteCount);
}

/// Fills `outputFrame` from the cached frame when the sizes match exactly.
/// @return true when the cached frame was copied; also sets
///         `timing.cachedFallbackUsed`.
bool OpenGLRenderPipeline::TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const
{
	const std::size_t byteCount = OutputFrameByteCount(outputFrame);
	if (byteCount == 0 || mCachedOutputFrame.size() != byteCount)
		return false;

	const auto copyStartTime = SteadyClock::now();
	std::memcpy(outputFrame.bytes, mCachedOutputFrame.data(), byteCount);
	timing.cachedCopyMilliseconds += MillisecondsBetween(copyStartTime, SteadyClock::now());
	timing.cachedFallbackUsed = true;
	return true;
}

/// Blocking readback straight into `destinationBytes`, from the pack
/// framebuffer for packed formats and the output framebuffer otherwise.
/// Sets `timing.syncFallbackUsed`.
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing)
{
	const auto readStartTime = SteadyClock::now();

	glPixelStorei(GL_PACK_ALIGNMENT, 4);
	glPixelStorei(GL_PACK_ROW_LENGTH, 0);
	if (UsesPackedOutput(state))
	{
		glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputPackFramebuffer());
		glReadPixels(0, 0, state.outputPackTextureWidth, state.outputFrameSize.height, GL_RGBA, GL_UNSIGNED_BYTE, destinationBytes);
	}
	else
	{
		glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputFramebuffer());
		glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, destinationBytes);
	}

	timing.syncReadMilliseconds += MillisecondsBetween(readStartTime, SteadyClock::now());
	timing.syncFallbackUsed = true;
}

/// Fills `outputFrame` according to the configured readback mode and returns
/// the timing breakdown.
///  - Synchronous: blocking read every frame.
///  - CachedOnly: reuse the cached frame, reading synchronously only when the
///    cache cannot serve the frame.
///  - AsyncPbo: consume a finished async readback, else reuse the cached
///    frame, else fall back to one blocking read; a new readback is queued on
///    every path.
OpenGLRenderPipeline::OutputReadbackTiming OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
{
	OutputReadbackTiming timing;

	if (mOutputReadbackMode == OutputReadbackMode::Synchronous)
	{
		if (outputFrame.bytes != nullptr)
		{
			ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
			CacheOutputFrame(outputFrame);
		}
		return timing;
	}

	if (mOutputReadbackMode == OutputReadbackMode::CachedOnly)
	{
		if (TryCopyCachedOutputFrame(outputFrame, timing))
			return timing;

		if (outputFrame.bytes != nullptr)
		{
			ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
			CacheOutputFrame(outputFrame);
		}
		return timing;
	}

	if (TryConsumeAsyncReadback(outputFrame, kAsyncReadbackWaitNanoseconds, timing))
	{
		(void)QueueAsyncReadback(state, timing);
		return timing;
	}

	if (TryCopyCachedOutputFrame(outputFrame, timing))
	{
		(void)QueueAsyncReadback(state, timing);
		return timing;
	}

	// Bootstrap: until an async readback has produced cached output that fits
	// this frame, use one synchronous readback so DeckLink has a valid frame to
	// schedule. A cache left over from a different output size counts as
	// unusable too; otherwise the frame would be handed back unwritten.
	const std::size_t frameBytes = OutputFrameByteCount(outputFrame);
	const bool cacheUnusable = mCachedOutputFrame.empty()
		|| (frameBytes != 0 && mCachedOutputFrame.size() != frameBytes);
	if (outputFrame.bytes != nullptr && cacheUnusable)
	{
		ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
		CacheOutputFrame(outputFrame);
	}

	FlushAsyncReadbackPipeline();
	(void)QueueAsyncReadback(state, timing);
	return timing;
}

/// Reads VST_OUTPUT_READBACK_MODE: "sync" selects Synchronous, "cached_only"
/// selects CachedOnly; "async_pbo", an unset variable, or any other value
/// selects AsyncPbo.
OpenGLRenderPipeline::OutputReadbackMode OpenGLRenderPipeline::ReadOutputReadbackModeFromEnvironment()
{
	char* mode = nullptr;
	std::size_t modeSize = 0;
	if (_dupenv_s(&mode, &modeSize, "VST_OUTPUT_READBACK_MODE") != 0 || mode == nullptr)
		return OutputReadbackMode::AsyncPbo;

	// _dupenv_s hands back an allocated copy; take the value, then free it.
	const std::string modeValue(mode);
	std::free(mode);

	if (modeValue == "sync")
		return OutputReadbackMode::Synchronous;
	if (modeValue == "cached_only")
		return OutputReadbackMode::CachedOnly;
	return OutputReadbackMode::AsyncPbo;
}