CPU optimisations
This commit is contained in:
@@ -4,6 +4,8 @@
|
||||
#include "RuntimeHost.h"
|
||||
#include "VideoIOFormat.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include <chrono>
|
||||
#include <gl/gl.h>
|
||||
|
||||
@@ -21,6 +23,11 @@ OpenGLRenderPipeline::OpenGLRenderPipeline(
|
||||
{
|
||||
}
|
||||
|
||||
// Tears the pipeline down by handing all async readback resources back through
// ResetAsyncReadbackState(), which drops outstanding fences and deletes the
// pixel-pack buffers.
// NOTE(review): the reset issues GL calls, so this presumably has to run while
// the owning GL context is current -- confirm against the object's owner.
OpenGLRenderPipeline::~OpenGLRenderPipeline()
{
    this->ResetAsyncReadbackState();
}
|
||||
|
||||
bool OpenGLRenderPipeline::RenderFrame(const RenderPipelineFrameContext& context, VideoIOOutputFrame& outputFrame)
|
||||
{
|
||||
const VideoIOState& state = context.videoState;
|
||||
@@ -62,9 +69,9 @@ void OpenGLRenderPipeline::PackOutputFor10Bit(const VideoIOState& state)
|
||||
glBindVertexArray(mRenderer.FullscreenVertexArray());
|
||||
glUseProgram(mRenderer.OutputPackProgram());
|
||||
|
||||
const GLint outputResolutionLocation = glGetUniformLocation(mRenderer.OutputPackProgram(), "uOutputVideoResolution");
|
||||
const GLint activeWordsLocation = glGetUniformLocation(mRenderer.OutputPackProgram(), "uActiveV210Words");
|
||||
const GLint packFormatLocation = glGetUniformLocation(mRenderer.OutputPackProgram(), "uOutputPackFormat");
|
||||
const GLint outputResolutionLocation = mRenderer.OutputPackResolutionLocation();
|
||||
const GLint activeWordsLocation = mRenderer.OutputPackActiveWordsLocation();
|
||||
const GLint packFormatLocation = mRenderer.OutputPackFormatLocation();
|
||||
if (outputResolutionLocation >= 0)
|
||||
glUniform2f(outputResolutionLocation, static_cast<float>(state.outputFrameSize.width), static_cast<float>(state.outputFrameSize.height));
|
||||
if (activeWordsLocation >= 0)
|
||||
@@ -78,18 +85,195 @@ void OpenGLRenderPipeline::PackOutputFor10Bit(const VideoIOState& state)
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
|
||||
bool OpenGLRenderPipeline::EnsureAsyncReadbackBuffers(std::size_t requiredBytes)
|
||||
{
|
||||
if (requiredBytes == 0)
|
||||
return false;
|
||||
|
||||
if (mAsyncReadbackBytes == requiredBytes && mAsyncReadbackSlots[0].pixelPackBuffer != 0)
|
||||
return true;
|
||||
|
||||
ResetAsyncReadbackState();
|
||||
mAsyncReadbackBytes = requiredBytes;
|
||||
for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
|
||||
{
|
||||
glGenBuffers(1, &slot.pixelPackBuffer);
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
|
||||
slot.sizeBytes = requiredBytes;
|
||||
slot.inFlight = false;
|
||||
}
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
||||
mAsyncReadbackWriteIndex = 0;
|
||||
mAsyncReadbackReadIndex = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::ResetAsyncReadbackState()
|
||||
{
|
||||
FlushAsyncReadbackPipeline();
|
||||
for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
|
||||
slot.sizeBytes = 0;
|
||||
|
||||
if (mAsyncReadbackSlots[0].pixelPackBuffer != 0)
|
||||
{
|
||||
for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
|
||||
{
|
||||
if (slot.pixelPackBuffer != 0)
|
||||
{
|
||||
glDeleteBuffers(1, &slot.pixelPackBuffer);
|
||||
slot.pixelPackBuffer = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mAsyncReadbackWriteIndex = 0;
|
||||
mAsyncReadbackReadIndex = 0;
|
||||
mAsyncReadbackBytes = 0;
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::FlushAsyncReadbackPipeline()
|
||||
{
|
||||
for (AsyncReadbackSlot& slot : mAsyncReadbackSlots)
|
||||
{
|
||||
if (slot.fence != nullptr)
|
||||
{
|
||||
glDeleteSync(slot.fence);
|
||||
slot.fence = nullptr;
|
||||
}
|
||||
slot.inFlight = false;
|
||||
}
|
||||
|
||||
mAsyncReadbackWriteIndex = 0;
|
||||
mAsyncReadbackReadIndex = 0;
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::QueueAsyncReadback(const VideoIOState& state)
|
||||
{
|
||||
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
|
||||
const std::size_t requiredBytes = static_cast<std::size_t>(state.outputFrameRowBytes) * state.outputFrameSize.height;
|
||||
const GLenum format = usePackedOutput ? GL_RGBA : GL_BGRA;
|
||||
const GLenum type = usePackedOutput ? GL_UNSIGNED_BYTE : GL_UNSIGNED_INT_8_8_8_8_REV;
|
||||
const GLuint framebuffer = usePackedOutput ? mRenderer.OutputPackFramebuffer() : mRenderer.OutputFramebuffer();
|
||||
const GLsizei readWidth = static_cast<GLsizei>(usePackedOutput ? state.outputPackTextureWidth : state.outputFrameSize.width);
|
||||
const GLsizei readHeight = static_cast<GLsizei>(state.outputFrameSize.height);
|
||||
|
||||
if (requiredBytes == 0)
|
||||
return;
|
||||
|
||||
if (mAsyncReadbackBytes != requiredBytes
|
||||
|| mAsyncReadbackFormat != format
|
||||
|| mAsyncReadbackType != type
|
||||
|| mAsyncReadbackFramebuffer != framebuffer)
|
||||
{
|
||||
mAsyncReadbackFormat = format;
|
||||
mAsyncReadbackType = type;
|
||||
mAsyncReadbackFramebuffer = framebuffer;
|
||||
if (!EnsureAsyncReadbackBuffers(requiredBytes))
|
||||
return;
|
||||
}
|
||||
|
||||
AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackWriteIndex];
|
||||
if (slot.fence != nullptr)
|
||||
{
|
||||
glDeleteSync(slot.fence);
|
||||
slot.fence = nullptr;
|
||||
}
|
||||
|
||||
glPixelStorei(GL_PACK_ALIGNMENT, 4);
|
||||
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
|
||||
if (state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10)
|
||||
glBindFramebuffer(GL_READ_FRAMEBUFFER, framebuffer);
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, static_cast<GLsizeiptr>(requiredBytes), nullptr, GL_STREAM_READ);
|
||||
glReadPixels(0, 0, readWidth, readHeight, format, type, nullptr);
|
||||
slot.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
slot.inFlight = slot.fence != nullptr;
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
|
||||
|
||||
mAsyncReadbackWriteIndex = (mAsyncReadbackWriteIndex + 1) % mAsyncReadbackSlots.size();
|
||||
}
|
||||
|
||||
// Attempts to deliver the oldest pending async read into `outputFrame`.
//
// outputFrame         Destination frame; `slot.sizeBytes` bytes are copied to
//                     `outputFrame.bytes`.
//                     NOTE(review): the destination capacity is not checked
//                     here -- the caller must supply a large enough buffer.
// timeoutNanoseconds  Maximum time to wait on the slot's fence. A non-zero
//                     value also requests a command flush while waiting; zero
//                     only polls.
//
// Returns true when the frame was copied (it is then also stored through
// CacheOutputFrame). Returns false when nothing is pending, the fence did not
// signal in time, or the buffer could not be mapped or unmapped cleanly.
bool OpenGLRenderPipeline::TryConsumeAsyncReadback(VideoIOOutputFrame& outputFrame, GLuint64 timeoutNanoseconds)
{
    if (mAsyncReadbackBytes == 0 || outputFrame.bytes == nullptr)
        return false;

    AsyncReadbackSlot& slot = mAsyncReadbackSlots[mAsyncReadbackReadIndex];
    if (!slot.inFlight || slot.fence == nullptr || slot.pixelPackBuffer == 0)
        return false;

    const GLbitfield waitFlags = timeoutNanoseconds > 0 ? GL_SYNC_FLUSH_COMMANDS_BIT : 0;
    const GLenum waitResult = glClientWaitSync(slot.fence, waitFlags, timeoutNanoseconds);
    if (waitResult != GL_ALREADY_SIGNALED && waitResult != GL_CONDITION_SATISFIED)
        return false;

    glDeleteSync(slot.fence);
    slot.fence = nullptr;

    glBindBuffer(GL_PIXEL_PACK_BUFFER, slot.pixelPackBuffer);
    void* mappedBytes = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
    if (mappedBytes == nullptr)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        slot.inFlight = false;
        mAsyncReadbackReadIndex = (mAsyncReadbackReadIndex + 1) % mAsyncReadbackSlots.size();
        return false;
    }

    std::memcpy(outputFrame.bytes, mappedBytes, slot.sizeBytes);
    // glUnmapBuffer returns GL_FALSE when the data store became corrupt while
    // it was mapped; the bytes just copied cannot be trusted in that case.
    const bool storeIntact = glUnmapBuffer(GL_PIXEL_PACK_BUFFER) != GL_FALSE;
    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);

    // The slot is finished either way, so release it and advance the ring.
    slot.inFlight = false;
    mAsyncReadbackReadIndex = (mAsyncReadbackReadIndex + 1) % mAsyncReadbackSlots.size();

    if (!storeIntact)
        return false;

    CacheOutputFrame(outputFrame);
    return true;
}
|
||||
|
||||
void OpenGLRenderPipeline::CacheOutputFrame(const VideoIOOutputFrame& outputFrame)
|
||||
{
|
||||
if (outputFrame.bytes == nullptr || outputFrame.height == 0 || outputFrame.rowBytes <= 0)
|
||||
return;
|
||||
|
||||
const std::size_t byteCount = static_cast<std::size_t>(outputFrame.rowBytes) * outputFrame.height;
|
||||
mCachedOutputFrame.resize(byteCount);
|
||||
std::memcpy(mCachedOutputFrame.data(), outputFrame.bytes, byteCount);
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::ReadOutputFrameSynchronously(const VideoIOState& state, void* destinationBytes)
|
||||
{
|
||||
const bool usePackedOutput = state.outputPixelFormat == VideoIOPixelFormat::V210 || state.outputPixelFormat == VideoIOPixelFormat::Yuva10;
|
||||
|
||||
glPixelStorei(GL_PACK_ALIGNMENT, 4);
|
||||
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
|
||||
if (usePackedOutput)
|
||||
{
|
||||
glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputPackFramebuffer());
|
||||
glReadPixels(0, 0, state.outputPackTextureWidth, state.outputFrameSize.height, GL_RGBA, GL_UNSIGNED_BYTE, outputFrame.bytes);
|
||||
glReadPixels(0, 0, state.outputPackTextureWidth, state.outputFrameSize.height, GL_RGBA, GL_UNSIGNED_BYTE, destinationBytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
glBindFramebuffer(GL_READ_FRAMEBUFFER, mRenderer.OutputFramebuffer());
|
||||
glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, outputFrame.bytes);
|
||||
glReadPixels(0, 0, state.outputFrameSize.width, state.outputFrameSize.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, destinationBytes);
|
||||
}
|
||||
}
|
||||
|
||||
void OpenGLRenderPipeline::ReadOutputFrame(const VideoIOState& state, VideoIOOutputFrame& outputFrame)
|
||||
{
|
||||
if (TryConsumeAsyncReadback(outputFrame, 500000))
|
||||
{
|
||||
QueueAsyncReadback(state);
|
||||
return;
|
||||
}
|
||||
|
||||
// If async readback misses the playout deadline, prefer a fresh synchronous
|
||||
// frame over reusing stale cached output, then restart the async pipeline.
|
||||
if (outputFrame.bytes != nullptr)
|
||||
{
|
||||
ReadOutputFrameSynchronously(state, outputFrame.bytes);
|
||||
CacheOutputFrame(outputFrame);
|
||||
}
|
||||
|
||||
FlushAsyncReadbackPipeline();
|
||||
QueueAsyncReadback(state);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user