failure
Some checks failed
CI / React UI Build (push) Successful in 10s
CI / Native Windows Build And Tests (push) Failing after 2m52s
CI / Windows Release Package (push) Has been skipped

Aiden
2026-05-12 00:35:01 +10:00
parent 9e3412712c
commit bf23cd880a
5 changed files with 350 additions and 12 deletions

View File

@@ -421,9 +421,7 @@ void VideoBackend::OutputProducerWorkerMain()
 	for (;;)
 	{
 		{
-			std::unique_lock<std::mutex> lock(mOutputProducerMutex);
-			mOutputProducerCondition.wait_for(lock, OutputProducerWakeInterval());
+			std::lock_guard<std::mutex> lock(mOutputProducerMutex);
 			if (mOutputProducerWorkerStopping)
 			{
 				mOutputProducerWorkerRunning = false;
@@ -431,11 +429,22 @@ void VideoBackend::OutputProducerWorkerMain()
 			}
 		}
+		ScheduleReadyOutputFramesToTarget();
 		const RenderOutputQueueMetrics metrics = mReadyOutputQueue.GetMetrics();
 		RecordReadyQueueDepthSample(metrics);
 		const OutputProductionDecision decision = mOutputProductionController.Decide(BuildOutputProductionPressure(metrics));
 		if (decision.action != OutputProductionAction::Produce || decision.requestedFrames == 0)
+		{
+			std::unique_lock<std::mutex> lock(mOutputProducerMutex);
+			mOutputProducerCondition.wait_for(lock, OutputProducerWakeInterval());
+			if (mOutputProducerWorkerStopping)
+			{
+				mOutputProducerWorkerRunning = false;
+				return;
+			}
 			continue;
+		}
 		VideoIOCompletion completion;
 		{
@@ -445,15 +454,32 @@ void VideoBackend::OutputProducerWorkerMain()
 			completion = mLastOutputProductionCompletion;
 		}
+		const bool belowTargetDepth = metrics.depth < decision.targetReadyFrames;
 		const auto now = std::chrono::steady_clock::now();
-		if (mLastOutputProductionTime != std::chrono::steady_clock::time_point() &&
+		if (!belowTargetDepth &&
+			mLastOutputProductionTime != std::chrono::steady_clock::time_point() &&
 			now - mLastOutputProductionTime < OutputProducerWakeInterval())
 		{
 			continue;
 		}
-		if (ProduceReadyOutputFrames(completion, 1) > 0)
+		const std::size_t producedFrames = ProduceReadyOutputFrames(completion, decision.requestedFrames);
+		if (producedFrames > 0)
+		{
 			mLastOutputProductionTime = std::chrono::steady_clock::now();
+			ScheduleReadyOutputFramesToTarget();
+			continue;
+		}
+		{
+			std::unique_lock<std::mutex> lock(mOutputProducerMutex);
+			mOutputProducerCondition.wait_for(lock, OutputProducerWakeInterval());
+			if (mOutputProducerWorkerStopping)
+			{
+				mOutputProducerWorkerRunning = false;
+				return;
+			}
+		}
 	}
 }
@@ -487,16 +513,27 @@ void VideoBackend::ProcessOutputFrameCompletion(const VideoIOCompletion& complet
 	}
 	NotifyOutputProducer();
-	if (!ScheduleReadyOutputFrame() &&
-		(ProduceReadyOutputFrames(completion, 1) == 0 || !ScheduleReadyOutputFrame()))
-	{
-		ScheduleBlackUnderrunFrame();
-	}
 	NotifyOutputProducer();
 	RecordBackendPlayoutHealth(completion.result, recoveryDecision);
 	RecordSystemMemoryPlayoutStats();
 }
 
+std::size_t VideoBackend::ScheduleReadyOutputFramesToTarget()
+{
+	const std::size_t targetScheduledFrames = static_cast<std::size_t>(mPlayoutPolicy.targetPrerollFrames);
+	std::size_t scheduledFrames = 0;
+	for (;;)
+	{
+		const SystemOutputFramePoolMetrics poolMetrics = mSystemOutputFramePool.GetMetrics();
+		if (poolMetrics.scheduledCount >= targetScheduledFrames)
+			break;
+		if (!ScheduleReadyOutputFrame())
+			break;
+		++scheduledFrames;
+	}
+	return scheduledFrames;
+}
+
 void VideoBackend::RecordBackendPlayoutHealth(VideoIOCompletionResult result, const VideoPlayoutRecoveryDecision& recoveryDecision)
 {
 	const RenderOutputQueueMetrics queueMetrics = mReadyOutputQueue.GetMetrics();
@@ -650,6 +687,7 @@ bool VideoBackend::RenderReadyOutputFrame(const VideoIOState& state, const Video
 bool VideoBackend::ScheduleReadyOutputFrame()
 {
+	std::lock_guard<std::mutex> schedulingLock(mOutputSchedulingMutex);
 	RenderOutputFrame readyFrame;
 	if (!mReadyOutputQueue.TryPop(readyFrame))
 		return false;

View File

@@ -81,6 +81,7 @@ private:
 	std::size_t ProduceReadyOutputFrames(const VideoIOCompletion& completion, std::size_t maxFrames);
 	OutputProductionPressure BuildOutputProductionPressure(const RenderOutputQueueMetrics& metrics) const;
 	bool RenderReadyOutputFrame(const VideoIOState& state, const VideoIOCompletion& completion);
+	std::size_t ScheduleReadyOutputFramesToTarget();
 	bool ScheduleReadyOutputFrame();
 	bool ScheduleBlackUnderrunFrame();
 	void RecordFramePacing(VideoIOCompletionResult completionResult);
@@ -117,6 +118,7 @@ private:
 	VideoIOCompletion mLastOutputProductionCompletion;
 	std::chrono::steady_clock::time_point mLastOutputProductionTime;
 	std::mutex mOutputProductionMutex;
+	std::mutex mOutputSchedulingMutex;
 	mutable std::mutex mOutputMetricsMutex;
 	bool mOutputCompletionWorkerRunning = false;
 	bool mOutputCompletionWorkerStopping = false;

View File

@@ -11,11 +11,11 @@ enum class VideoUnderrunBehavior
 struct VideoPlayoutPolicy
 {
 	unsigned outputFramePoolSize = 10;
-	unsigned targetPrerollFrames = 12;
+	unsigned targetPrerollFrames = 4;
 	unsigned targetReadyFrames = 2;
 	unsigned maxReadyFrames = 4;
 	unsigned minimumSpareDeviceFrames = 1;
-	uint64_t lateOrDropCatchUpFrames = 2;
+	uint64_t lateOrDropCatchUpFrames = 0;
 	VideoUnderrunBehavior underrunBehavior = VideoUnderrunBehavior::ReuseLastCompletedFrame;
 	bool adaptiveHeadroomEnabled = false;
 };

View File

@@ -213,3 +213,299 @@ Five-second delta:
Read: Read:
The main readback stall appears to have been the previous format/path combination, not unavoidable BGRA8 bandwidth. The remaining problem now looks like cadence and buffering: the producer can average real-time throughput again, but the ready queue still runs empty often enough to create visible short stutters.
## Experiment 5: producer burst-fill ready queue
Status: sampled
Date: 2026-05-12
Change:
- The output producer now honors `OutputProductionDecision::requestedFrames` instead of always producing one frame per wake (a sketch of how such a burst request could be formed follows this list).
- The producer no longer applies its wake-interval throttle while the ready queue is below target depth.
- Completion fallback remains conservative; the background producer is responsible for building the cushion after immediate scheduling.
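For context, here is a sketch of how a pressure-based controller could form a burst request. The real `OutputProductionController` is not part of this commit's diff, so apart from the names that appear in the loop above (`OutputProductionDecision`, `requestedFrames`, `targetReadyFrames`, `OutputProductionAction::Produce`), everything below is an assumption, including the `Hold` action, the `readyDepth` pressure field, and the policy members.

```cpp
// Hypothetical sketch only; not this commit's implementation.
OutputProductionDecision OutputProductionController::Decide(const OutputProductionPressure& pressure) const
{
	OutputProductionDecision decision{};
	decision.targetReadyFrames = mPolicy.targetReadyFrames;
	if (pressure.readyDepth >= mPolicy.maxReadyFrames)
	{
		decision.action = OutputProductionAction::Hold; // assumed non-Produce action
		return decision;
	}
	decision.action = OutputProductionAction::Produce;
	// Request enough frames to refill the ready queue to target in one wake,
	// instead of the previous fixed single frame per wake.
	decision.requestedFrames = pressure.readyDepth < mPolicy.targetReadyFrames
		? mPolicy.targetReadyFrames - pressure.readyDepth
		: 1;
	return decision;
}
```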
Question:
Now that BGRA8 readback is fast enough on average, can the producer maintain a small ready-frame cushion instead of hovering at zero?
Expected interpretation:
- If short stutters reduce and `readyQueue.depth` spends more time above zero, the remaining issue was producer cadence/headroom.
- If `readyQueue.depth` still remains pinned near zero, inspect render-thread contention next: preview present, input upload, runtime-event bursts, and live-state composition.
- If render spikes increase, burst production may be overloading the shared render thread and should be tuned with a smaller target/depth policy.
Result:
- User-visible playback looked about the same.
- DeckLink reported a healthier 10-frame buffer.
- The app ready queue now briefly reaches 1-3, but still often drains to 0.
- No late, dropped, flushed, async-miss, or cached-fallback deltas were observed in the 8-second sample.
- Readback remained fast.
Representative samples:
| readyDepth | renderMs | smoothedRenderMs | drawMs | mapMs | copyMs | asyncQueueReadPixelsMs | queueWaitMs |
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| 0 | 4.756 | 10.135 | 0.502 | 0.186 | 0.603 | 0.088 | 0.032 |
| 1 | 5.135 | 6.968 | 0.730 | 1.269 | 0.772 | 0.088 | 0.073 |
| 1 | 3.578 | 6.821 | 0.702 | 1.247 | 0.618 | 0.097 | 0.103 |
| 1 | 6.733 | 7.996 | 0.537 | 0.952 | 0.694 | 0.082 | 1.218 |
| 0 | 5.276 | 16.782 | 0.550 | 0.119 | 0.766 | 0.090 | 0.016 |
Eight-second delta:
| pushed | popped | ready underruns | zero-depth samples | late delta | dropped delta | async misses | cached fallbacks | system scheduled |
| ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| 477 | 478 | 109 | 291 | 0 | 0 | 0 | 0 | 12 |
Read:
Burst filling improved device-side buffering but did not remove the visible cadence issues. The remaining stutter is now less likely to be caused by raw output readback or device starvation. Next candidates are render-thread interference and pacing jitter: preview present, input upload, runtime-event/live-state bursts, and occasional completion/render spikes.
## Experiment 6: producer work-before-sleep pacing
Status: ready for hardware test
Date: 2026-05-12
Change:
- The output producer now checks ready-queue pressure before waiting on the producer condition variable.
- When production is requested, the producer renders immediately instead of first sleeping for `OutputProducerWakeInterval()`.
- The wake interval remains the idle/no-work sleep path rather than a mandatory pre-production delay (a distilled sketch of the resulting loop shape follows this list).
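The distilled shape of the work-before-sleep loop, using the member names from the diff above; `TryProduceAndScheduleOutputFrames()` is a hypothetical helper that bundles the decide/produce/schedule steps for brevity.

```cpp
// Sketch of the pattern, not the full implementation.
void VideoBackend::OutputProducerWorkerMain()
{
	for (;;)
	{
		if (TryProduceAndScheduleOutputFrames())
			continue; // work was done: re-check pressure immediately, no sleep
		// Idle path only: nothing to produce, so wait up to one wake interval.
		std::unique_lock<std::mutex> lock(mOutputProducerMutex);
		mOutputProducerCondition.wait_for(lock, OutputProducerWakeInterval());
		if (mOutputProducerWorkerStopping)
		{
			mOutputProducerWorkerRunning = false;
			return;
		}
	}
}
```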
Question:
Does removing the unconditional pre-check sleep let the producer rebuild queue headroom more quickly after a shallow-queue or focus-related disturbance?
Expected interpretation:
- If DeckLink buffer depth is steadier and ready-queue underruns become less frequent, the pre-production sleep was part of the cadence loss.
- If the result is unchanged, the next likely culprit is render-thread interference rather than producer wake timing.
- If CPU usage rises while playback does not improve, the producer may need a more explicit event/pacing model instead of tighter polling.
## Experiment 7: remove just-in-time render from completion path
Status: ready for hardware test
Date: 2026-05-12
Change:
- DeckLink completion processing no longer renders an output frame synchronously when the ready queue is empty.
- Completion now schedules an already-ready frame if one exists, otherwise it uses the explicit underrun fallback and wakes the producer.
- The producer is now solely responsible for rendering ahead and keeping the ready queue fed.
Question:
Does removing completion-time rendering make output cadence more stable by keeping DeckLink completion handling short and predictable?
Expected interpretation:
- If playback improves or completion pacing spikes shrink, just-in-time rendering in the completion path was harming cadence.
- If underrun/fallback counts increase, the producer still is not maintaining enough ready headroom.
- If visible output gets worse but telemetry is clearer, implement a real repeat-last-system-frame fallback instead of rendering from completion.
## Experiment 8: four-frame DeckLink preroll
Status: ready for hardware test
Date: 2026-05-12
Change:
- `VideoPlayoutPolicy::targetPrerollFrames` is reduced from 12 to 4.
- The system-memory frame pool remains larger than the DeckLink preroll so the producer can still build app-side ready headroom.
Question:
Can a smaller DeckLink scheduled buffer stay stable now that BGRA8 readback is fast and the producer is responsible for render-ahead?
Expected interpretation:
- If DeckLink holds around 4 frames and playback cadence is acceptable, a large 10-12 frame device buffer is not required.
- If focus changes or render-thread jitter drain DeckLink below 4, the next work should prioritize real device-buffer telemetry and render-thread interference.
- If black flicker continues, it is the explicit underrun fallback being exposed by the no-JIT completion path, not a lack of DeckLink preroll alone.
## Experiment 9: no steady-state black fallback
Status: sampled
Date: 2026-05-12
Change:
- Normal DeckLink completion processing no longer schedules a black fallback frame when the app ready queue is empty.
- `RenderOutputQueue::TryPop()` still records the app-ready underrun (a minimal sketch follows this list).
- The producer is woken and the existing DeckLink scheduled buffer is allowed to carry playback.
- The four-frame DeckLink preroll experiment remains active.
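A minimal sketch of an underrun-recording `TryPop()`, assuming a mutex-guarded deque and a plain counter; the real `RenderOutputQueue` internals are not part of this commit's diff.

```cpp
// Sketch under assumed members (mMutex, mFrames, mUnderrunCount).
bool RenderOutputQueue::TryPop(RenderOutputFrame& outFrame)
{
	std::lock_guard<std::mutex> lock(mMutex);
	if (mFrames.empty())
	{
		++mUnderrunCount; // the miss is still counted in telemetry,
		return false;     // but no black frame is scheduled in response
	}
	outFrame = std::move(mFrames.front());
	mFrames.pop_front();
	return true;
}
```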
Question:
Was the visible black flicker caused by treating an app-ready queue miss as immediate device starvation?
Expected interpretation:
- If black flicker disappears while app-ready underruns still increase, the fallback was too aggressive and should stay out of the steady-state path.
- If DeckLink buffer drains or late/dropped frames increase, we need real device-buffer telemetry and a controlled emergency policy.
- If visible stutter remains without black, the next work is cadence attribution: preview present, input upload, render-thread priority, and actual DeckLink buffered-frame count.
Result:
- Playback was smooth briefly, then froze once the DeckLink buffer reached 0.
- The buffer did not refill.
Read:
Removing the black fallback exposed another completion-driven assumption. The producer could render into the app ready queue, but scheduling still happened only from completion processing. Once the scheduled DeckLink buffer reached 0, completions stopped, so no later trigger scheduled the producer's ready frames.
## Experiment 10: producer-side scheduling
Status: sampled
Date: 2026-05-12
Change:
- The producer now schedules the frames it produces instead of waiting for a future DeckLink completion to schedule them.
- A dedicated output scheduling mutex serializes scheduling calls from the producer and completion worker.
- The four-frame DeckLink preroll and no steady-state black fallback experiments remain active.
Question:
Can the producer maintain the four-frame DeckLink buffer without relying on completion-time rendering or black fallback insertion?
Expected interpretation:
- If the buffer refills and playback no longer freezes, producer-side scheduling is required for a real proactive playout model.
- If black flicker is gone but stutter remains, focus on render-thread jitter and actual device-buffer telemetry.
- If the buffer overfills or scheduling timing becomes odd, add real DeckLink buffered-frame telemetry and schedule only up to a measured target.
Result:
- The DeckLink buffer stayed full.
- Playback had a low-framerate look.
- Over a 6-second sample, `pushedDelta` and `poppedDelta` were 310, but `underrunDelta` was also 310.
- Late and dropped counts increased.
- Synthetic scheduled lead grew very large, indicating producer-side scheduling was running too far ahead of the intended four-frame cushion.
Read:
Producer-side scheduling is required, but it must be capped by a real scheduling target. Scheduling every produced frame overfeeds the scheduler timeline and can produce odd cadence even when the DeckLink buffer appears full.
## Experiment 11: cap producer scheduling to preroll target
Status: sampled
Date: 2026-05-12
Change:
- The producer still renders proactively.
- After production, it schedules ready frames only until the system-memory scheduled count reaches `VideoPlayoutPolicy::targetPrerollFrames`.
- With the current experiment settings, that target remains four frames.
Question:
Can producer-side scheduling keep the four-frame buffer fed without running hundreds of frames ahead in scheduler time?
Expected interpretation:
- If the low-framerate look disappears and the buffer stays around four, producer scheduling needed a cap.
- If the buffer drains, the cap needs actual DeckLink `GetBufferedVideoFrameCount()` telemetry rather than the system-memory scheduled-count approximation (see the sketch after this list).
- If stutter remains with sane lead, investigate render-thread interference next.
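If that telemetry is added, the DeckLink SDK exposes the device-side count directly through `IDeckLinkOutput::GetBufferedVideoFrameCount()`. A sketch, assuming the backend holds an `IDeckLinkOutput*` named `mDeckLinkOutput` and a hypothetical telemetry hook:

```cpp
// Sample the device's own buffered-frame count instead of approximating it
// from the system-memory scheduled count. mDeckLinkOutput and
// RecordDeviceBufferDepthSample are assumed names, not part of this commit.
uint32_t bufferedFrames = 0;
if (mDeckLinkOutput->GetBufferedVideoFrameCount(&bufferedFrames) == S_OK)
{
	RecordDeviceBufferDepthSample(bufferedFrames);
}
```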
Result:
- Playback still had the low-framerate look.
- The system-memory scheduled count held at the four-frame target.
- Synthetic scheduled lead still grew, with scheduled frame index advancing faster than completed frame index.
Read:
The cap was active, but completion and producer were both still scheduling ready frames. The result was still over-scheduling relative to completions, even though the system-memory scheduled count stayed at four.
## Experiment 12: producer owns steady-state scheduling
Status: sampled
Date: 2026-05-12
Change:
- Completion processing now only releases completed frames, records telemetry, and wakes the producer (sketched after this list).
- Completion no longer schedules from the ready queue during steady state.
- Producer-side scheduling remains capped to `targetPrerollFrames`.
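A sketch of the resulting steady-state completion path. Only `RecordBackendPlayoutHealth`, `RecordSystemMemoryPlayoutStats`, and `NotifyOutputProducer` appear in this commit's diff; the release and recovery helpers below are hypothetical stand-ins for the elided parts of the handler.

```cpp
void VideoBackend::ProcessOutputFrameCompletion(const VideoIOCompletion& completion)
{
	// Hypothetical helpers standing in for the elided release/recovery logic.
	const VideoPlayoutRecoveryDecision recoveryDecision = DecidePlayoutRecovery(completion);
	ReleaseCompletedOutputFrame(completion);
	// Record telemetry and wake the producer; no ready-queue scheduling here.
	RecordBackendPlayoutHealth(completion.result, recoveryDecision);
	RecordSystemMemoryPlayoutStats();
	NotifyOutputProducer();
}
```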
Question:
Does having a single steady-state scheduler stop the schedule timeline from running ahead and recover normal cadence?
Expected interpretation:
- If scheduled lead stops growing and playback cadence improves, duplicate completion/producer scheduling was the low-framerate cause.
- If the buffer drains, the producer wake/schedule loop is still not responsive enough.
- If lead still grows, inspect `VideoPlayoutScheduler` catch-up accounting next.
Result:
- Playback froze on startup.
- Telemetry showed rendered ready frames in the app ready queue, but zero system-memory frames scheduled.
Read:
Removing completion-side scheduling exposed another producer-loop gap. The producer only scheduled immediately after producing frames. Once the ready queue reached its max depth, production stopped, and the already-ready frames were never handed to DeckLink.
## Experiment 13: producer top-up scheduling before production
Status: pending hardware build
Date: 2026-05-12
Change:
- The producer now attempts to top up DeckLink scheduling from already-ready frames before deciding whether to render more frames.
- The producer also tops up after successful production.
- Completion remains release/record/wake only.
Question:
Can the producer own steady-state scheduling without freezing when ready frames already exist?
Expected interpretation:
- If startup no longer freezes and the four-frame buffer stays stable, the producer needed an explicit schedule-before-produce pass.
- If cadence is still wrong, the next target is scheduler timeline accounting or actual DeckLink buffered-frame telemetry.
Result:
- Playback alternated between smooth stretches and freezes.
- The app ready queue was no longer starving; it held around 3-4 frames and had no new ready underruns in the sampled delta.
- Late and dropped counts increased.
- `scheduledIndexDelta` was much larger than `completedIndexDelta`, even with producer scheduling capped.
Read:
The proactive producer now feeds the app queue, but `VideoPlayoutScheduler` catch-up accounting still advances scheduled stream time on late/drop recovery. That creates timeline gaps and produces the smooth/freeze/smooth cadence.
## Experiment 14: disable late/drop catch-up skipping
Status: pending hardware build
Date: 2026-05-12
Change:
- `VideoPlayoutPolicy::lateOrDropCatchUpFrames` is set to 0.
- Late/drop results should still be reported, but the scheduler should not advance `mScheduledFrameIndex` by extra catch-up frames (the sketch after this list illustrates the accounting in question).
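A hypothetical sketch of the catch-up accounting this experiment disables; the real `VideoPlayoutScheduler` is not part of this commit's diff, and the completion-result names below are assumptions.

```cpp
// Sketch only: illustrates why schedule-time skipping creates timeline gaps.
void VideoPlayoutScheduler::OnScheduledFrameCompleted(VideoIOCompletionResult result)
{
	++mCompletedFrameIndex;
	if (result == VideoIOCompletionResult::Late || result == VideoIOCompletionResult::Dropped)
	{
		// With lateOrDropCatchUpFrames > 0 the schedule skips ahead to chase the
		// device clock, but every skipped index is stream time that no rendered
		// frame will ever fill, which reads back as the smooth/freeze cadence.
		mScheduledFrameIndex += mPolicy.lateOrDropCatchUpFrames;
	}
	// With lateOrDropCatchUpFrames == 0, late/drop is still reported, and
	// mScheduledFrameIndex advances only as frames are actually scheduled.
}
```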
Question:
Does removing schedule-time skipping stop the smooth/freeze cadence now that the producer owns steady-state scheduling?
Expected interpretation:
- If `scheduledIndexDelta` closely matches actual scheduled/completed frame flow and playback smooths out, catch-up skipping was harmful in proactive mode.
- If late/dropped counts still climb without catch-up, inspect actual DeckLink buffered-frame count and render-thread interference.

View File

@@ -13,6 +13,7 @@ Implemented so far:
 - ready-queue discard paths release owned frames instead of leaking slots
 - telemetry scaffolding exposes free, ready, and scheduled system-memory frame counts
 - async PBO readback is now a deeper pipeline by default and ordinary misses no longer flush queued readbacks
+- the output producer now honors requested burst production when the ready queue is below target instead of producing only one frame per wake
 Still to verify/tune on hardware:
@@ -21,6 +22,7 @@ Still to verify/tune on hardware:
 - repeat/underrun policy behavior under real stalls
 - whether deeper async readback reduces sawtooth buffer drain
 - whether BGRA8 bandwidth is sufficient before considering v210
+- whether burst filling keeps `readyQueue.depth` above zero and reduces the remaining short stutters
 
 Phase 7.5 isolated the current playout timing problem around output readback and DeckLink scheduling pressure. The fast-transfer path from the DeckLink OpenGL sample is not available on the current test GPU, so the next direction is to make the normal path behave more like broadcast playout systems: render ahead, read back into system-memory frame buffers, and let DeckLink consume already-complete frames.