|
|
|
|
@@ -8,7 +8,9 @@
|
|
|
|
|
#include <cstring>
|
|
|
|
|
|
|
|
|
|
#include <chrono>
|
|
|
|
|
#include <cstdlib>
|
|
|
|
|
#include <gl/gl.h>
|
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
|
|
OpenGLRenderPipeline::OpenGLRenderPipeline(
|
|
|
|
|
OpenGLRenderer& renderer,
|
|
|
|
|
@@ -22,7 +24,8 @@ OpenGLRenderPipeline::OpenGLRenderPipeline(
|
|
|
|
|
mHealthTelemetry(healthTelemetry),
|
|
|
|
|
mRenderEffect(renderEffect),
|
|
|
|
|
mOutputReady(outputReady),
|
|
|
|
|
mPaint(paint)
|
|
|
|
|
mPaint(paint),
|
|
|
|
|
mOutputReadbackMode(ReadOutputReadbackModeFromEnvironment())
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -53,9 +56,22 @@ bool OpenGLRenderPipeline::RenderFrame(const RenderPipelineFrameContext& context
|
|
|
|
|
mHealthTelemetry.TryRecordPerformanceStats(state.frameBudgetMilliseconds, renderMilliseconds);
|
|
|
|
|
mRuntimeSnapshotProvider.AdvanceFrame();
|
|
|
|
|
|
|
|
|
|
ReadOutputFrame(state, outputFrame);
|
|
|
|
|
if (mPaint)
|
|
|
|
|
mPaint();
|
|
|
|
|
OutputReadbackTiming readbackTiming = ReadOutputFrame(state, outputFrame);
|
|
|
|
|
mHealthTelemetry.TryRecordOutputRenderPipelineTiming(
|
|
|
|
|
renderMilliseconds,
|
|
|
|
|
readbackTiming.fenceWaitMilliseconds,
|
|
|
|
|
readbackTiming.mapMilliseconds,
|
|
|
|
|
readbackTiming.copyMilliseconds,
|
|
|
|
|
readbackTiming.cachedCopyMilliseconds,
|
|
|
|
|
readbackTiming.asyncQueueMilliseconds,
|
|
|
|
|
readbackTiming.asyncQueueBufferMilliseconds,
|
|
|
|
|
readbackTiming.asyncQueueSetupMilliseconds,
|
|
|
|
|
readbackTiming.asyncQueueReadPixelsMilliseconds,
|
|
|
|
|
readbackTiming.asyncQueueFenceMilliseconds,
|
|
|
|
|
readbackTiming.syncReadMilliseconds,
|
|
|
|
|
readbackTiming.asyncReadbackMissed,
|
|
|
|
|
readbackTiming.cachedFallbackUsed,
|
|
|
|
|
readbackTiming.syncFallbackUsed);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
@@ -151,8 +167,9 @@ void OpenGLRenderPipeline::FlushAsyncReadbackPipeline()
|
|
|
|
|
mAsyncReadbackReadIndex = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
|
|
|
|
|
bool OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing)
|
|
|
|
|
{
|
|
|
|
|
const auto queueStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
|
|
|
|
|
const std::size_t requiredBytes = static_cast<std::size_t>(state.outputFrameRowBytes) * state.outputFrameSize.height;
|
|
|
|
|
const GLenum format = usePackedOutput ? GL_RGBA : GL_BGRA;
|
|
|
|
|
@@ -161,8 +178,16 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
|
|
|
|
|
const GLsizei readWidth = static_cast<GLsizei>(usePackedOutput ? state.outputPackTextureWidth : state.outputFrameSize.width);
|
|
|
|
|
const GLsizei readHeight = static_cast<GLsizei>(state.outputFrameSize.height);
|
|
|
|
|
|
|
|
|
|
const auto finishTiming = [&timing, queueStartTime]() {
|
|
|
|
|
const auto queueEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.asyncQueueMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(queueEndTime - queueStartTime).count();
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if (requiredBytes == 0)
|
|
|
|
|
return;
|
|
|
|
|
{
|
|
|
|
|
finishTiming();
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (mAsyncReadbackBytes != requiredBytes
|
|
|
|
|
|| mAsyncReadbackFormat != format
|
|
|
|
|
@@ -173,30 +198,50 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
|
|
|
|
|
mAsyncReadbackType = type;
|
|
|
|
|
mAsyncReadbackFramebuffer = framebuffer;
|
|
|
|
|
if (!EnsureAsyncReadbackBuffers(requiredBytes))
|
|
|
|
|
return;
|
|
|
|
|
{
|
|
|
|
|
finishTiming();
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackWriteIndex];
|
|
|
|
|
if (slot.fence != nullptr)
|
|
|
|
|
if (slot.inFlight)
|
|
|
|
|
{
|
|
|
|
|
glDeleteSync(slot.fence);
|
|
|
|
|
slot.fence = nullptr;
|
|
|
|
|
finishTiming();
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto stageStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
glPixelStorei(GL_PACK_ALIGNMENT, 4);
|
|
|
|
|
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
|
|
|
|
|
glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer);
|
|
|
|
|
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
|
|
|
|
|
auto stageEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.asyncQueueSetupMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
|
|
|
|
|
|
|
|
|
|
stageStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
|
|
|
|
|
stageEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.asyncQueueBufferMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
|
|
|
|
|
|
|
|
|
|
stageStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
glReadPixels(0, 0, readWidth, readHeight, format, type, nullptr);
|
|
|
|
|
stageEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.asyncQueueReadPixelsMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
|
|
|
|
|
|
|
|
|
|
stageStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
|
|
|
|
stageEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.asyncQueueFenceMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
|
|
|
|
|
slot.inFlight = slot.fence != nullptr;
|
|
|
|
|
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
|
|
|
|
|
|
|
|
|
mAsyncReadbackWriteIndex = (mAsyncReadbackWriteIndex + 1) % mAsyncReadbackSlots.size();
|
|
|
|
|
finishTiming();
|
|
|
|
|
return slot.inFlight;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds)
|
|
|
|
|
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing)
|
|
|
|
|
{
|
|
|
|
|
if (mAsyncReadbackBytes == 0 || outputFrame.bytes == nullptr)
|
|
|
|
|
return false;
|
|
|
|
|
@@ -206,15 +251,24 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
const GLenum waitFlags = timeoutNanoseconds > 0 ? GL_SYNC_FLUSH_COMMANDS_BIT : 0;
|
|
|
|
|
const auto waitStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
const GLenum waitResult = glClientWaitSync(slot.fence, waitFlags, timeoutNanoseconds);
|
|
|
|
|
const auto waitEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.fenceWaitMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(waitEndTime - waitStartTime).count();
|
|
|
|
|
if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED)
|
|
|
|
|
{
|
|
|
|
|
timing.asyncReadbackMissed = true;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
glDeleteSync(slot.fence);
|
|
|
|
|
slot.fence = nullptr;
|
|
|
|
|
|
|
|
|
|
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
|
|
|
|
|
const auto mapStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
void* mappedBytes = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
|
|
|
|
|
const auto mapEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.mapMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(mapEndTime - mapStartTime).count();
|
|
|
|
|
if (mappedBytes == nullptr)
|
|
|
|
|
{
|
|
|
|
|
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
|
|
|
|
@@ -223,7 +277,10 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const auto copyStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
std::memcpy(outputFrame.bytes, mappedBytes, slot.sizeBytes);
|
|
|
|
|
const auto copyEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.copyMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(copyEndTime - copyStartTime).count();
|
|
|
|
|
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
|
|
|
|
|
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
|
|
|
|
|
|
|
|
|
@@ -243,8 +300,26 @@ void OpenGLRenderPipeline::CacheOutputFrame(const VideoIOOutputFrame& outputFram
|
|
|
|
|
std::memcpy(mCachedOutputFrame.data(), outputFrame.bytes, byteCount);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes)
|
|
|
|
|
// Attempts to satisfy an output request from the cached frame bytes.
//
// The copy happens only when the destination is usable (non-null bytes,
// non-zero height, positive rowBytes) and the cached byte count equals
// rowBytes * height exactly. On success the copy duration is added to
// timing.cachedCopyMilliseconds, timing.cachedFallbackUsed is set, and true
// is returned. On any rejection, false is returned and neither outputFrame
// nor timing is modified.
bool OpenGLRenderPipeline::TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const
{
	const bool destinationUsable = outputFrame.bytes != nullptr && outputFrame.height != 0 && outputFrame.rowBytes > 0;
	if (!destinationUsable)
	{
		return false;
	}

	// A size mismatch (including an empty cache) means the cached bytes cannot
	// be copied into this frame as-is.
	const std::size_t expectedBytes = static_cast<std::size_t>(outputFrame.rowBytes) * outputFrame.height;
	if (expectedBytes != mCachedOutputFrame.size())
	{
		return false;
	}

	using Milliseconds = std::chrono::duration<double, std::milli>;
	const auto begin = std::chrono::steady_clock::now();
	std::memcpy(outputFrame.bytes, mCachedOutputFrame.data(), expectedBytes);
	const Milliseconds elapsed = std::chrono::steady_clock::now() - begin;

	timing.cachedCopyMilliseconds += elapsed.count();
	timing.cachedFallbackUsed = true;
	return true;
}
|
|
|
|
|
|
|
|
|
|
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing)
|
|
|
|
|
{
|
|
|
|
|
const auto readStartTime = std::chrono::steady_clock::now();
|
|
|
|
|
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
|
|
|
|
|
|
|
|
|
|
glPixelStorei(GL_PACK_ALIGNMENT, 4);
|
|
|
|
|
@@ -259,24 +334,78 @@ void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& stat
|
|
|
|
|
glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputFramebuffer());
|
|
|
|
|
glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, destinationBytes);
|
|
|
|
|
}
|
|
|
|
|
const auto readEndTime = std::chrono::steady_clock::now();
|
|
|
|
|
timing.syncReadMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(readEndTime - readStartTime).count();
|
|
|
|
|
timing.syncFallbackUsed = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
|
|
|
|
|
OpenGLRenderPipeline::OutputReadbackTiming OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
|
|
|
|
|
{
|
|
|
|
|
if (TryConsumeAsyncReadback(outputFrame, 500000))
|
|
|
|
|
OutputReadbackTiming timing;
|
|
|
|
|
|
|
|
|
|
if (mOutputReadbackMode == OutputReadbackMode::Synchronous)
|
|
|
|
|
{
|
|
|
|
|
QueueAsyncReadback(state);
|
|
|
|
|
return;
|
|
|
|
|
if (outputFrame.bytes != nullptr)
|
|
|
|
|
{
|
|
|
|
|
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
|
|
|
|
|
CacheOutputFrame(outputFrame);
|
|
|
|
|
}
|
|
|
|
|
return timing;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// If async readback misses the playout deadline, prefer a fresh synchronous
|
|
|
|
|
// frame over reusing stale cached output, then restart the async pipeline.
|
|
|
|
|
if (outputFrame.bytes != nullptr)
|
|
|
|
|
if (mOutputReadbackMode == OutputReadbackMode::CachedOnly)
|
|
|
|
|
{
|
|
|
|
|
ReadOutputFrameSynchronously(state, outputFrame.bytes);
|
|
|
|
|
if (TryCopyCachedOutputFrame(outputFrame, timing))
|
|
|
|
|
return timing;
|
|
|
|
|
|
|
|
|
|
if (outputFrame.bytes != nullptr)
|
|
|
|
|
{
|
|
|
|
|
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
|
|
|
|
|
CacheOutputFrame(outputFrame);
|
|
|
|
|
}
|
|
|
|
|
return timing;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (TryConsumeAsyncReadback(outputFrame, 500000, timing))
|
|
|
|
|
{
|
|
|
|
|
(void)QueueAsyncReadback(state, timing);
|
|
|
|
|
return timing;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (TryCopyCachedOutputFrame(outputFrame, timing))
|
|
|
|
|
{
|
|
|
|
|
(void)QueueAsyncReadback(state, timing);
|
|
|
|
|
return timing;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Bootstrap only: until the first async readback has produced cached output,
|
|
|
|
|
// use one synchronous readback so DeckLink has a valid frame to schedule.
|
|
|
|
|
if (outputFrame.bytes != nullptr && mCachedOutputFrame.empty())
|
|
|
|
|
{
|
|
|
|
|
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
|
|
|
|
|
CacheOutputFrame(outputFrame);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
FlushAsyncReadbackPipeline();
|
|
|
|
|
QueueAsyncReadback(state);
|
|
|
|
|
(void)QueueAsyncReadback(state, timing);
|
|
|
|
|
return timing;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Selects the output readback mode from the VST_OUTPUT_READBACK_MODE
// environment variable.
//
// Recognised values (exact, case-sensitive match):
//   "async_pbo"   -> OutputReadbackMode::AsyncPbo
//   "sync"        -> OutputReadbackMode::Synchronous
//   "cached_only" -> OutputReadbackMode::CachedOnly
// A failed lookup, an unset variable, or any other value yields AsyncPbo.
OpenGLRenderPipeline::OutputReadbackMode OpenGLRenderPipeline::ReadOutputReadbackModeFromEnvironment()
{
	char* rawValue = nullptr;
	std::size_t rawValueSize = 0;
	const bool lookupFailed = _dupenv_s(&rawValue, &rawValueSize, "VST_OUTPUT_READBACK_MODE") != 0;
	if (lookupFailed || rawValue == nullptr)
	{
		return OutputReadbackMode::AsyncPbo;
	}

	// Take an owning copy first so the buffer handed out by _dupenv_s can be
	// released before any comparison is made.
	const std::string requested(rawValue);
	std::free(rawValue);

	OutputReadbackMode selected = OutputReadbackMode::AsyncPbo;
	if (requested == "async_pbo")
	{
		selected = OutputReadbackMode::AsyncPbo;
	}
	else if (requested == "sync")
	{
		selected = OutputReadbackMode::Synchronous;
	}
	else if (requested == "cached_only")
	{
		selected = OutputReadbackMode::CachedOnly;
	}
	return selected;
}
|
|
|
|
|
|