Performance chasing
All checks were successful
CI / React UI Build (push) Successful in 10s
CI / Native Windows Build And Tests (push) Successful in 2m51s
CI / Windows Release Package (push) Successful in 2m55s

This commit is contained in:
Aiden
2026-05-11 23:10:45 +10:00
parent c5cead6003
commit a434a88108
18 changed files with 1115 additions and 82 deletions

View File

@@ -18,6 +18,7 @@ RenderEngine::RenderEngine(
mRenderPass(mRenderer),
mRenderPipeline(mRenderer, runtimeSnapshotProvider, healthTelemetry, std::move(renderEffect), std::move(screenshotReady), std::move(previewPaint)),
mShaderPrograms(mRenderer, runtimeSnapshotProvider),
mHealthTelemetry(healthTelemetry),
mHdc(hdc),
mHglrc(hglrc),
mFrameStateResolver(runtimeSnapshotProvider)
@@ -546,7 +547,11 @@ bool RenderEngine::RequestOutputFrame(const RenderPipelineFrameContext& context,
{
if (mRenderThreadRunning)
{
return TryInvokeOnRenderThread("output-render", [this, &context, &outputFrame]() {
const auto queuedAt = std::chrono::steady_clock::now();
return TryInvokeOnRenderThread("output-render", [this, &context, &outputFrame, queuedAt]() {
const auto startedAt = std::chrono::steady_clock::now();
const double queueWaitMilliseconds = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(startedAt - queuedAt).count();
mHealthTelemetry.TryRecordOutputRenderQueueWait(queueWaitMilliseconds);
mRenderCommandQueue.RequestOutputFrame({ context.videoState, context.completion });
RenderOutputFrameRequest request;
return mRenderCommandQueue.TryTakeOutputFrame(request) &&

View File

@@ -209,6 +209,7 @@ private:
OpenGLRenderPass mRenderPass;
OpenGLRenderPipeline mRenderPipeline;
OpenGLShaderPrograms mShaderPrograms;
HealthTelemetry& mHealthTelemetry;
HDC mHdc;
HGLRC mHglrc;

View File

@@ -164,6 +164,9 @@ error:
void OpenGLComposite::paintGL(bool force)
{
if (mRuntimeUpdateController)
mRuntimeUpdateController->ProcessRuntimeWork();
if (!force)
{
if (IsIconic(hGLWnd))
@@ -171,6 +174,12 @@ void OpenGLComposite::paintGL(bool force)
}
const unsigned previewFps = mRuntimeStore ? mRuntimeStore->GetConfiguredPreviewFps() : 30u;
if (!force && mVideoBackend && mVideoBackend->ShouldPrioritizeOutputOverPreview())
{
ValidateRect(hGLWnd, NULL);
return;
}
if (!mRenderEngine->TryPresentPreview(force, previewFps, mVideoBackend->OutputFrameWidth(), mVideoBackend->OutputFrameHeight()))
{
ValidateRect(hGLWnd, NULL);
@@ -261,6 +270,9 @@ bool OpenGLComposite::Start()
if (!mRenderEngine->StartRenderThread())
return false;
if (mRuntimeUpdateController)
mRuntimeUpdateController->ProcessRuntimeWork();
if (mVideoBackend->Start())
return true;
@@ -351,9 +363,6 @@ bool OpenGLComposite::RequestScreenshot(std::string& error)
void OpenGLComposite::renderEffect()
{
if (mRuntimeUpdateController)
mRuntimeUpdateController->ProcessRuntimeWork();
const RenderFrameInput frameInput = BuildRenderFrameInput();
RenderFrame(frameInput);
}

View File

@@ -8,7 +8,9 @@
#include <cstring>
#include <chrono>
#include <cstdlib>
#include <gl/gl.h>
#include <string>
OpenGLRenderPipeline::OpenGLRenderPipeline(
OpenGLRenderer& renderer,
@@ -22,7 +24,8 @@ OpenGLRenderPipeline::OpenGLRenderPipeline(
mHealthTelemetry(healthTelemetry),
mRenderEffect(renderEffect),
mOutputReady(outputReady),
mPaint(paint)
mPaint(paint),
mOutputReadbackMode(ReadOutputReadbackModeFromEnvironment())
{
}
@@ -53,9 +56,22 @@ bool OpenGLRenderPipeline::RenderFrame(const RenderPipelineFrameContext& context
mHealthTelemetry.TryRecordPerformanceStats(state.frameBudgetMilliseconds, renderMilliseconds);
mRuntimeSnapshotProvider.AdvanceFrame();
ReadOutputFrame(state, outputFrame);
if (mPaint)
mPaint();
OutputReadbackTiming readbackTiming = ReadOutputFrame(state, outputFrame);
mHealthTelemetry.TryRecordOutputRenderPipelineTiming(
renderMilliseconds,
readbackTiming.fenceWaitMilliseconds,
readbackTiming.mapMilliseconds,
readbackTiming.copyMilliseconds,
readbackTiming.cachedCopyMilliseconds,
readbackTiming.asyncQueueMilliseconds,
readbackTiming.asyncQueueBufferMilliseconds,
readbackTiming.asyncQueueSetupMilliseconds,
readbackTiming.asyncQueueReadPixelsMilliseconds,
readbackTiming.asyncQueueFenceMilliseconds,
readbackTiming.syncReadMilliseconds,
readbackTiming.asyncReadbackMissed,
readbackTiming.cachedFallbackUsed,
readbackTiming.syncFallbackUsed);
return true;
}
@@ -151,8 +167,9 @@ void OpenGLRenderPipeline::FlushAsyncReadbackPipeline()
mAsyncReadbackReadIndex = 0;
}
void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
bool OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing)
{
const auto queueStartTime = std::chrono::steady_clock::now();
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
const std::size_t requiredBytes = static_cast<std::size_t>(state.outputFrameRowBytes) * state.outputFrameSize.height;
const GLenum format = usePackedOutput ? GL_RGBA : GL_BGRA;
@@ -161,8 +178,16 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
const GLsizei readWidth = static_cast<GLsizei>(usePackedOutput ? state.outputPackTextureWidth : state.outputFrameSize.width);
const GLsizei readHeight = static_cast<GLsizei>(state.outputFrameSize.height);
const auto finishTiming = [&timing, queueStartTime]() {
const auto queueEndTime = std::chrono::steady_clock::now();
timing.asyncQueueMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(queueEndTime - queueStartTime).count();
};
if (requiredBytes == 0)
return;
{
finishTiming();
return false;
}
if (mAsyncReadbackBytes != requiredBytes
|| mAsyncReadbackFormat != format
@@ -173,30 +198,50 @@ void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
mAsyncReadbackType = type;
mAsyncReadbackFramebuffer = framebuffer;
if (!EnsureAsyncReadbackBuffers(requiredBytes))
return;
{
finishTiming();
return false;
}
}
AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackWriteIndex];
if (slot.fence != nullptr)
if (slot.inFlight)
{
glDeleteSync(slot.fence);
slot.fence = nullptr;
finishTiming();
return false;
}
auto stageStartTime = std::chrono::steady_clock::now();
glPixelStorei(GL_PACK_ALIGNMENT, 4);
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
auto stageEndTime = std::chrono::steady_clock::now();
timing.asyncQueueSetupMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
stageStartTime = std::chrono::steady_clock::now();
glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
stageEndTime = std::chrono::steady_clock::now();
timing.asyncQueueBufferMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
stageStartTime = std::chrono::steady_clock::now();
glReadPixels(0, 0, readWidth, readHeight, format, type, nullptr);
stageEndTime = std::chrono::steady_clock::now();
timing.asyncQueueReadPixelsMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
stageStartTime = std::chrono::steady_clock::now();
slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
stageEndTime = std::chrono::steady_clock::now();
timing.asyncQueueFenceMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(stageEndTime - stageStartTime).count();
slot.inFlight = slot.fence != nullptr;
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
mAsyncReadbackWriteIndex = (mAsyncReadbackWriteIndex + 1) % mAsyncReadbackSlots.size();
finishTiming();
return slot.inFlight;
}
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds)
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing)
{
if (mAsyncReadbackBytes == 0 || outputFrame.bytes == nullptr)
return false;
@@ -206,15 +251,24 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra
return false;
const GLenum waitFlags = timeoutNanoseconds > 0 ? GL_SYNC_FLUSH_COMMANDS_BIT : 0;
const auto waitStartTime = std::chrono::steady_clock::now();
const GLenum waitResult = glClientWaitSync(slot.fence, waitFlags, timeoutNanoseconds);
const auto waitEndTime = std::chrono::steady_clock::now();
timing.fenceWaitMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(waitEndTime - waitStartTime).count();
if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED)
{
timing.asyncReadbackMissed = true;
return false;
}
glDeleteSync(slot.fence);
slot.fence = nullptr;
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
const auto mapStartTime = std::chrono::steady_clock::now();
void* mappedBytes = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
const auto mapEndTime = std::chrono::steady_clock::now();
timing.mapMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(mapEndTime - mapStartTime).count();
if (mappedBytes == nullptr)
{
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -223,7 +277,10 @@ bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFra
return false;
}
const auto copyStartTime = std::chrono::steady_clock::now();
std::memcpy(outputFrame.bytes, mappedBytes, slot.sizeBytes);
const auto copyEndTime = std::chrono::steady_clock::now();
timing.copyMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(copyEndTime - copyStartTime).count();
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -243,8 +300,26 @@ void OpenGLRenderPipeline::CacheOutputFrame(const VideoIOOutputFrame& outputFram
std::memcpy(mCachedOutputFrame.data(), outputFrame.bytes, byteCount);
}
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes)
bool OpenGLRenderPipeline::TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const
{
if (outputFrame.bytes == nullptr || outputFrame.height == 0 || outputFrame.rowBytes <= 0)
return false;
const std::size_t byteCount = static_cast<std::size_t>(outputFrame.rowBytes) * outputFrame.height;
if (mCachedOutputFrame.size() != byteCount)
return false;
const auto copyStartTime = std::chrono::steady_clock::now();
std::memcpy(outputFrame.bytes, mCachedOutputFrame.data(), byteCount);
const auto copyEndTime = std::chrono::steady_clock::now();
timing.cachedCopyMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(copyEndTime - copyStartTime).count();
timing.cachedFallbackUsed = true;
return true;
}
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing)
{
const auto readStartTime = std::chrono::steady_clock::now();
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
glPixelStorei(GL_PACK_ALIGNMENT, 4);
@@ -259,24 +334,78 @@ void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& stat
glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputFramebuffer());
glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, destinationBytes);
}
const auto readEndTime = std::chrono::steady_clock::now();
timing.syncReadMilliseconds += std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(readEndTime - readStartTime).count();
timing.syncFallbackUsed = true;
}
void OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
OpenGLRenderPipeline::OutputReadbackTiming OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
{
if (TryConsumeAsyncReadback(outputFrame, 500000))
OutputReadbackTiming timing;
if (mOutputReadbackMode == OutputReadbackMode::Synchronous)
{
QueueAsyncReadback(state);
return;
if (outputFrame.bytes != nullptr)
{
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
CacheOutputFrame(outputFrame);
}
return timing;
}
// If async readback misses the playout deadline, prefer a fresh synchronous
// frame over reusing stale cached output, then restart the async pipeline.
if (outputFrame.bytes != nullptr)
if (mOutputReadbackMode == OutputReadbackMode::CachedOnly)
{
ReadOutputFrameSynchronously(state, outputFrame.bytes);
if (TryCopyCachedOutputFrame(outputFrame, timing))
return timing;
if (outputFrame.bytes != nullptr)
{
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
CacheOutputFrame(outputFrame);
}
return timing;
}
if (TryConsumeAsyncReadback(outputFrame, 500000, timing))
{
(void)QueueAsyncReadback(state, timing);
return timing;
}
if (TryCopyCachedOutputFrame(outputFrame, timing))
{
(void)QueueAsyncReadback(state, timing);
return timing;
}
// Bootstrap only: until the first async readback has produced cached output,
// use one synchronous readback so DeckLink has a valid frame to schedule.
if (outputFrame.bytes != nullptr && mCachedOutputFrame.empty())
{
ReadOutputFrameSynchronously(state, outputFrame.bytes, timing);
CacheOutputFrame(outputFrame);
}
FlushAsyncReadbackPipeline();
QueueAsyncReadback(state);
(void)QueueAsyncReadback(state, timing);
return timing;
}
OpenGLRenderPipeline::OutputReadbackMode OpenGLRenderPipeline::ReadOutputReadbackModeFromEnvironment()
{
char* mode = nullptr;
std::size_t modeSize = 0;
if (_dupenv_s(&mode, &modeSize, "VST_OUTPUT_READBACK_MODE") != 0 || mode == nullptr)
return OutputReadbackMode::AsyncPbo;
const std::string modeValue(mode);
std::free(mode);
if (modeValue == "async_pbo")
return OutputReadbackMode::AsyncPbo;
if (modeValue == "sync")
return OutputReadbackMode::Synchronous;
if (modeValue == "cached_only")
return OutputReadbackMode::CachedOnly;
return OutputReadbackMode::AsyncPbo;
}

View File

@@ -36,6 +36,13 @@ public:
bool RenderFrame(const RenderPipelineFrameContext& context, VideoIOOutputFrame& outputFrame);
private:
enum class OutputReadbackMode
{
AsyncPbo,
Synchronous,
CachedOnly
};
struct AsyncReadbackSlot
{
GLuint pixelPackBuffer = 0;
@@ -44,15 +51,34 @@ private:
bool inFlight = false;
};
struct OutputReadbackTiming
{
double fenceWaitMilliseconds = 0.0;
double mapMilliseconds = 0.0;
double copyMilliseconds = 0.0;
double cachedCopyMilliseconds = 0.0;
double asyncQueueMilliseconds = 0.0;
double asyncQueueBufferMilliseconds = 0.0;
double asyncQueueSetupMilliseconds = 0.0;
double asyncQueueReadPixelsMilliseconds = 0.0;
double asyncQueueFenceMilliseconds = 0.0;
double syncReadMilliseconds = 0.0;
bool asyncReadbackMissed = false;
bool cachedFallbackUsed = false;
bool syncFallbackUsed = false;
};
bool EnsureAsyncReadbackBuffers(std::size_t requiredBytes);
void ResetAsyncReadbackState();
void FlushAsyncReadbackPipeline();
void QueueAsyncReadback(const VideoIOState& state);
bool TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds);
bool QueueAsyncReadback(const VideoIOState& state, OutputReadbackTiming& timing);
bool TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds, OutputReadbackTiming& timing);
void CacheOutputFrame(const VideoIOOutputFrame& outputFrame);
void ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes);
bool TryCopyCachedOutputFrame(VideoIOOutputFrame& outputFrame, OutputReadbackTiming& timing) const;
void ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes, OutputReadbackTiming& timing);
void PackOutputFor10Bit(const VideoIOState& state);
void ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame);
OutputReadbackTiming ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame);
static OutputReadbackMode ReadOutputReadbackModeFromEnvironment();
OpenGLRenderer& mRenderer;
RuntimeSnapshotProvider& mRuntimeSnapshotProvider;
@@ -60,6 +86,7 @@ private:
RenderEffectCallback mRenderEffect;
OutputReadyCallback mOutputReady;
PaintCallback mPaint;
OutputReadbackMode mOutputReadbackMode = OutputReadbackMode::AsyncPbo;
std::array<AsyncReadbackSlot, 3> mAsyncReadbackSlots;
std::size_t mAsyncReadbackWriteIndex = 0;
std::size_t mAsyncReadbackReadIndex = 0;