2015-05-24 06:55:12 +02:00
|
|
|
// Copyright 2008 Dolphin Emulator Project
|
2021-07-05 03:22:19 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2008-12-08 06:30:24 +01:00
|
|
|
|
2017-09-02 19:23:26 +02:00
|
|
|
#include "VideoCommon/Fifo.h"
|
|
|
|
|
2015-05-27 06:42:56 +02:00
|
|
|
#include <atomic>
|
2016-01-17 22:54:31 +01:00
|
|
|
#include <cstring>
|
2015-05-27 06:42:56 +02:00
|
|
|
|
2016-01-17 22:54:31 +01:00
|
|
|
#include "Common/Assert.h"
|
2015-05-27 20:53:09 +02:00
|
|
|
#include "Common/BlockingLoop.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
#include "Common/ChunkFile.h"
|
2015-03-05 17:12:24 +01:00
|
|
|
#include "Common/Event.h"
|
2014-02-20 04:11:52 +01:00
|
|
|
#include "Common/FPURoundMode.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
#include "Common/MemoryUtil.h"
|
2016-01-17 22:54:31 +01:00
|
|
|
#include "Common/MsgHandler.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
|
2022-01-06 01:08:56 +01:00
|
|
|
#include "Core/Config/MainSettings.h"
|
2014-09-09 06:24:49 +02:00
|
|
|
#include "Core/ConfigManager.h"
|
2016-01-19 00:08:18 +01:00
|
|
|
#include "Core/CoreTiming.h"
|
2022-02-13 01:28:24 +01:00
|
|
|
#include "Core/HW/GPFifo.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
#include "Core/HW/Memmap.h"
|
2016-11-10 17:55:21 +01:00
|
|
|
#include "Core/Host.h"
|
2022-01-07 03:50:18 +01:00
|
|
|
#include "Core/System.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
|
2015-01-31 11:38:23 +01:00
|
|
|
#include "VideoCommon/AsyncRequests.h"
|
2014-09-06 23:26:40 +02:00
|
|
|
#include "VideoCommon/CPMemory.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
#include "VideoCommon/CommandProcessor.h"
|
2014-07-08 16:49:33 +02:00
|
|
|
#include "VideoCommon/DataReader.h"
|
2022-09-26 23:41:56 +02:00
|
|
|
#include "VideoCommon/FramebufferManager.h"
|
2014-02-17 11:18:15 +01:00
|
|
|
#include "VideoCommon/OpcodeDecoding.h"
|
2014-09-06 23:26:40 +02:00
|
|
|
#include "VideoCommon/VertexLoaderManager.h"
|
2015-05-27 20:53:09 +02:00
|
|
|
#include "VideoCommon/VertexManagerBase.h"
|
2017-09-02 19:23:26 +02:00
|
|
|
#include "VideoCommon/VideoBackendBase.h"
|
2008-12-08 06:30:24 +01:00
|
|
|
|
2016-01-12 22:44:58 +01:00
|
|
|
namespace Fifo
|
|
|
|
{
|
2016-01-10 12:18:45 +01:00
|
|
|
static constexpr int GPU_TIME_SLOT_SIZE = 1000;
|
2015-12-25 22:07:00 +01:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
FifoManager::FifoManager(Core::System& system) : m_system{system}
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
FifoManager::~FifoManager() = default;
|
|
|
|
|
|
|
|
void FifoManager::RefreshConfig()
|
2022-01-07 06:41:41 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_config_sync_gpu = Config::Get(Config::MAIN_SYNC_GPU);
|
|
|
|
m_config_sync_gpu_max_distance = Config::Get(Config::MAIN_SYNC_GPU_MAX_DISTANCE);
|
|
|
|
m_config_sync_gpu_min_distance = Config::Get(Config::MAIN_SYNC_GPU_MIN_DISTANCE);
|
|
|
|
m_config_sync_gpu_overclock = Config::Get(Config::MAIN_SYNC_GPU_OVERCLOCK);
|
2022-01-07 06:41:41 +01:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::DoState(PointerWrap& p)
|
2008-10-04 00:05:28 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
p.DoArray(m_video_buffer, FIFO_SIZE);
|
|
|
|
u8* write_ptr = m_video_buffer_write_ptr;
|
|
|
|
p.DoPointer(write_ptr, m_video_buffer);
|
|
|
|
m_video_buffer_write_ptr = write_ptr;
|
|
|
|
p.DoPointer(m_video_buffer_read_ptr, m_video_buffer);
|
|
|
|
if (p.IsReadMode() && m_use_deterministic_gpu_thread)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
|
|
|
// We're good and paused, right?
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_seen_ptr = m_video_buffer_pp_read_ptr = m_video_buffer_read_ptr;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
p.Do(m_sync_ticks);
|
|
|
|
p.Do(m_syncing_suspended);
|
2011-12-31 05:16:12 +01:00
|
|
|
}
|
2009-07-03 12:00:09 +02:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::PauseAndLock(bool do_lock, bool unpause_on_unlock)
|
2011-12-31 05:16:12 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
if (do_lock)
|
2011-12-31 05:16:12 +01:00
|
|
|
{
|
2016-08-19 04:35:58 +02:00
|
|
|
SyncGPU(SyncGPUReason::Other);
|
2011-12-31 05:16:12 +01:00
|
|
|
EmulatorState(false);
|
2016-11-10 17:55:21 +01:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
if (!m_system.IsDualCoreMode() || m_use_deterministic_gpu_thread)
|
2016-11-10 17:55:21 +01:00
|
|
|
return;
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.WaitYield(std::chrono::milliseconds(100), Host_YieldToUI);
|
2011-12-31 05:16:12 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
if (unpause_on_unlock)
|
2011-12-31 05:16:12 +01:00
|
|
|
EmulatorState(true);
|
|
|
|
}
|
2008-08-28 11:19:46 +02:00
|
|
|
}
|
2008-12-08 06:30:24 +01:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::Init()
|
2011-02-13 05:10:40 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (!m_config_callback_id)
|
|
|
|
m_config_callback_id = Config::AddConfigChangedCallback([this] { RefreshConfig(); });
|
2022-01-07 06:41:41 +01:00
|
|
|
RefreshConfig();
|
|
|
|
|
2014-12-09 03:49:57 +01:00
|
|
|
// Padded so that SIMD overreads in the vertex loader are safe
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer = static_cast<u8*>(Common::AllocateMemoryPages(FIFO_SIZE + 4));
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
ResetVideoBuffer();
|
2023-12-19 03:31:32 +01:00
|
|
|
if (m_system.IsDualCoreMode())
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Prepare();
|
|
|
|
m_sync_ticks.store(0);
|
2008-07-12 19:40:22 +02:00
|
|
|
}
|
2008-12-08 06:30:24 +01:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::Shutdown()
|
2008-07-12 19:40:22 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_gpu_mainloop.IsRunning())
|
2020-11-14 04:33:26 +01:00
|
|
|
PanicAlertFmt("FIFO shutting down while active");
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
Common::FreeMemoryPages(m_video_buffer, FIFO_SIZE + 4);
|
|
|
|
m_video_buffer = nullptr;
|
|
|
|
m_video_buffer_write_ptr = nullptr;
|
|
|
|
m_video_buffer_pp_read_ptr = nullptr;
|
|
|
|
m_video_buffer_read_ptr = nullptr;
|
|
|
|
m_video_buffer_seen_ptr = nullptr;
|
|
|
|
m_fifo_aux_write_ptr = nullptr;
|
|
|
|
m_fifo_aux_read_ptr = nullptr;
|
2022-01-07 06:41:41 +01:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_config_callback_id)
|
2022-01-07 06:41:41 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
Config::RemoveConfigChangedCallback(*m_config_callback_id);
|
|
|
|
m_config_callback_id = std::nullopt;
|
2022-01-07 06:41:41 +01:00
|
|
|
}
|
2008-07-12 19:40:22 +02:00
|
|
|
}
|
2008-12-08 06:30:24 +01:00
|
|
|
|
2009-03-08 00:34:16 +01:00
|
|
|
// May be executed from any thread, even the graphics thread.
|
|
|
|
// Created to allow for self shutdown.
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::ExitGpuLoop()
|
2009-08-09 13:03:58 +02:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
auto& command_processor = m_system.GetCommandProcessor();
|
2022-11-27 13:50:50 +01:00
|
|
|
auto& fifo = command_processor.GetFifo();
|
|
|
|
|
2010-01-16 23:37:38 +01:00
|
|
|
// This should break the wait loop in CPU thread
|
2022-11-27 13:50:50 +01:00
|
|
|
fifo.bFF_GPReadEnable.store(0, std::memory_order_relaxed);
|
2023-12-19 03:31:32 +01:00
|
|
|
FlushGpu();
|
2015-03-05 21:14:46 +01:00
|
|
|
|
2010-01-16 23:37:38 +01:00
|
|
|
// Terminate GPU thread loop
|
2022-12-09 20:01:25 +01:00
|
|
|
m_emu_running_state.Set();
|
2023-06-05 05:25:20 +02:00
|
|
|
m_gpu_mainloop.Stop(Common::BlockingLoop::StopMode::NonBlock);
|
2010-01-03 17:04:40 +01:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::EmulatorState(bool running)
|
2010-01-03 17:04:40 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_emu_running_state.Set(running);
|
2016-07-15 02:20:04 +02:00
|
|
|
if (running)
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Wakeup();
|
2016-07-15 02:20:04 +02:00
|
|
|
else
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.AllowSleep();
|
2009-03-08 00:34:16 +01:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::SyncGPU(SyncGPUReason reason, bool may_move_read_ptr)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_use_deterministic_gpu_thread)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Wait();
|
|
|
|
if (!m_gpu_mainloop.IsRunning())
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
return;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// Opportunistically reset FIFOs so we don't wrap around.
|
2022-12-09 20:01:25 +01:00
|
|
|
if (may_move_read_ptr && m_fifo_aux_write_ptr != m_fifo_aux_read_ptr)
|
2020-11-14 04:33:26 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
PanicAlertFmt("Aux FIFO not synced ({}, {})", fmt::ptr(m_fifo_aux_write_ptr),
|
|
|
|
fmt::ptr(m_fifo_aux_read_ptr));
|
2020-11-14 04:33:26 +01:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
memmove(m_fifo_aux_data, m_fifo_aux_read_ptr, m_fifo_aux_write_ptr - m_fifo_aux_read_ptr);
|
|
|
|
m_fifo_aux_write_ptr -= (m_fifo_aux_read_ptr - m_fifo_aux_data);
|
|
|
|
m_fifo_aux_read_ptr = m_fifo_aux_data;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
if (may_move_read_ptr)
|
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
u8* write_ptr = m_video_buffer_write_ptr;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// what's left over in the buffer
|
2022-12-09 20:01:25 +01:00
|
|
|
size_t size = write_ptr - m_video_buffer_pp_read_ptr;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
memmove(m_video_buffer, m_video_buffer_pp_read_ptr, size);
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// This change always decreases the pointers. We write seen_ptr
|
|
|
|
// after write_ptr here, and read it before in RunGpuLoop, so
|
|
|
|
// 'write_ptr > seen_ptr' there cannot become spuriously true.
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_write_ptr = write_ptr = m_video_buffer + size;
|
|
|
|
m_video_buffer_pp_read_ptr = m_video_buffer;
|
|
|
|
m_video_buffer_read_ptr = m_video_buffer;
|
|
|
|
m_video_buffer_seen_ptr = write_ptr;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::PushFifoAuxBuffer(const void* ptr, size_t size)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (size > (size_t)(m_fifo_aux_data + FIFO_SIZE - m_fifo_aux_write_ptr))
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2016-08-19 04:35:58 +02:00
|
|
|
SyncGPU(SyncGPUReason::AuxSpace, /* may_move_read_ptr */ false);
|
2022-12-09 20:01:25 +01:00
|
|
|
if (!m_gpu_mainloop.IsRunning())
|
Exit ReadDataFromFifoOnCPU, PushFifoAuxBuffer early if shutting down (GpuRunningState=false)
This was causing a race condition where the "absurdly large aux buffer"
panic alert would be triggered in the last bit of fifo processing on the
CPU thread in deterministic mode (i.e. netplay). SyncGPU is supposed to
move the auxiliary queue data to the beginning of the containing buffer
so we don't have to deal with wraparound; if GpuRunningState is false,
however, it just returns, because it's set to false by another thread -
thus it doesn't know whether RunGpuLoop is still executing (in which
case it can't just reset the pointers, because it may still be using the
buffer) or not (in which case the condition variable it normally waits
for to avoid the previous problem will never be signaled). However,
SyncGPU's caller PushFifoAuxBuffer wasn't aware of this, so if the
buffer was filling at just the right time, it'd stay full and that
function would complain that it was about to overflow it. Similar
problem with ReadDataFromFifoOnCPU afaik. Fix this by returning early
from those as well; other callers of SyncGPU should be safe. A
*slightly* cleaner alternative would be giving the CPU thread a way to
tell when RunGpuLoop has actually exited, but whatever, this works.
2015-04-21 20:46:44 +02:00
|
|
|
{
|
|
|
|
// GPU is shutting down
|
|
|
|
return;
|
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
if (size > (size_t)(m_fifo_aux_data + FIFO_SIZE - m_fifo_aux_write_ptr))
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
|
|
|
// That will sync us up to the last 32 bytes, so this short region
|
|
|
|
// of FIFO would have to point to a 2MB display list or something.
|
2020-11-14 04:33:26 +01:00
|
|
|
PanicAlertFmt("Absurdly large aux buffer");
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
memcpy(m_fifo_aux_write_ptr, ptr, size);
|
|
|
|
m_fifo_aux_write_ptr += size;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void* FifoManager::PopFifoAuxBuffer(size_t size)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
void* ret = m_fifo_aux_read_ptr;
|
|
|
|
m_fifo_aux_read_ptr += size;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
return ret;
|
|
|
|
}
|
2011-02-10 05:47:02 +01:00
|
|
|
|
Big Fifo Commit Part2: Now the fifo is more stable than my first commit, so is time...
- ReImplementing Single Core Mode like Dual Core Mode Style.
- Stage 1: My goal is, we have the Fifo, CommandProccessor code the more clear, maintenible and documented possible. When I quit dolphin I want any developer can continue with the work only reading the code.
* Big Refactoring: A lot of functions was changed the names, and modularized.
Now the FifoLoop and CatchUpGPU does not exist, was replaced by RunGpu() and RunGpuLoop().
The general idea is modeling the code like the real HW. The fifo is only a buffer where the Write Gather Pipe write the commands and from the Graphic Processor read these.
* Big Clean UP a lot of obsolete code and comments was deleted, like DcFakeWachDog, "Fifo very soon hack", etc.
In the stage 2, I will refactoring more code doing emphasis in the division of CommandProcessor, Fifo, Gpu Emulation. Beside I will comment all functions and variables in the code (Don't worry I will ask for English help for this part ;) )
Please test a lot SC mode and DC mode :)
Thank you so much for testing always and the patience. I don't like broke your favorite game but... you must believe me this part is very sensible, I only try to contribute for have a better and stable dolphin emulator.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7185 8ced0084-cf51-0410-be5f-012b33b47a6e
2011-02-17 05:25:21 +01:00
|
|
|
// Description: RunGpuLoop() sends data through this function.
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::ReadDataFromFifo(u32 read_ptr)
|
2009-06-15 06:30:02 +02:00
|
|
|
{
|
2022-02-13 01:28:24 +01:00
|
|
|
if (GPFifo::GATHER_PIPE_SIZE >
|
2022-12-09 20:01:25 +01:00
|
|
|
static_cast<size_t>(m_video_buffer + FIFO_SIZE - m_video_buffer_write_ptr))
|
2009-06-15 06:30:02 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
const size_t existing_len = m_video_buffer_write_ptr - m_video_buffer_read_ptr;
|
2022-02-13 01:28:24 +01:00
|
|
|
if (GPFifo::GATHER_PIPE_SIZE > static_cast<size_t>(FIFO_SIZE - existing_len))
|
2009-06-15 06:30:02 +02:00
|
|
|
{
|
2022-02-13 01:28:24 +01:00
|
|
|
PanicAlertFmt("FIFO out of bounds (existing {} + new {} > {})", existing_len,
|
|
|
|
GPFifo::GATHER_PIPE_SIZE, FIFO_SIZE);
|
2014-08-26 19:37:32 +02:00
|
|
|
return;
|
2009-06-15 06:30:02 +02:00
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
memmove(m_video_buffer, m_video_buffer_read_ptr, existing_len);
|
|
|
|
m_video_buffer_write_ptr = m_video_buffer + existing_len;
|
|
|
|
m_video_buffer_read_ptr = m_video_buffer;
|
2009-06-15 06:30:02 +02:00
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
// Copy new video instructions to m_video_buffer for future use in rendering the new picture
|
2023-12-19 03:31:32 +01:00
|
|
|
auto& memory = m_system.GetMemory();
|
|
|
|
memory.CopyFromEmu(m_video_buffer_write_ptr, read_ptr, GPFifo::GATHER_PIPE_SIZE);
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_write_ptr += GPFifo::GATHER_PIPE_SIZE;
|
2009-06-15 06:30:02 +02:00
|
|
|
}
|
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// The deterministic_gpu_thread version.
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::ReadDataFromFifoOnCPU(u32 read_ptr)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
u8* write_ptr = m_video_buffer_write_ptr;
|
|
|
|
if (GPFifo::GATHER_PIPE_SIZE > static_cast<size_t>(m_video_buffer + FIFO_SIZE - write_ptr))
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
|
|
|
// We can't wrap around while the GPU is working on the data.
|
|
|
|
// This should be very rare due to the reset in SyncGPU.
|
2016-08-19 04:35:58 +02:00
|
|
|
SyncGPU(SyncGPUReason::Wraparound);
|
2022-12-09 20:01:25 +01:00
|
|
|
if (!m_gpu_mainloop.IsRunning())
|
Exit ReadDataFromFifoOnCPU, PushFifoAuxBuffer early if shutting down (GpuRunningState=false)
This was causing a race condition where the "absurdly large aux buffer"
panic alert would be triggered in the last bit of fifo processing on the
CPU thread in deterministic mode (i.e. netplay). SyncGPU is supposed to
move the auxiliary queue data to the beginning of the containing buffer
so we don't have to deal with wraparound; if GpuRunningState is false,
however, it just returns, because it's set to false by another thread -
thus it doesn't know whether RunGpuLoop is still executing (in which
case it can't just reset the pointers, because it may still be using the
buffer) or not (in which case the condition variable it normally waits
for to avoid the previous problem will never be signaled). However,
SyncGPU's caller PushFifoAuxBuffer wasn't aware of this, so if the
buffer was filling at just the right time, it'd stay full and that
function would complain that it was about to overflow it. Similar
problem with ReadDataFromFifoOnCPU afaik. Fix this by returning early
from those as well; other callers of SyncGPU should be safe. A
*slightly* cleaner alternative would be giving the CPU thread a way to
tell when RunGpuLoop has actually exited, but whatever, this works.
2015-04-21 20:46:44 +02:00
|
|
|
{
|
2015-05-27 20:53:09 +02:00
|
|
|
// GPU is shutting down, so the next asserts may fail
|
Exit ReadDataFromFifoOnCPU, PushFifoAuxBuffer early if shutting down (GpuRunningState=false)
This was causing a race condition where the "absurdly large aux buffer"
panic alert would be triggered in the last bit of fifo processing on the
CPU thread in deterministic mode (i.e. netplay). SyncGPU is supposed to
move the auxiliary queue data to the beginning of the containing buffer
so we don't have to deal with wraparound; if GpuRunningState is false,
however, it just returns, because it's set to false by another thread -
thus it doesn't know whether RunGpuLoop is still executing (in which
case it can't just reset the pointers, because it may still be using the
buffer) or not (in which case the condition variable it normally waits
for to avoid the previous problem will never be signaled). However,
SyncGPU's caller PushFifoAuxBuffer wasn't aware of this, so if the
buffer was filling at just the right time, it'd stay full and that
function would complain that it was about to overflow it. Similar
problem with ReadDataFromFifoOnCPU afaik. Fix this by returning early
from those as well; other callers of SyncGPU should be safe. A
*slightly* cleaner alternative would be giving the CPU thread a way to
tell when RunGpuLoop has actually exited, but whatever, this works.
2015-04-21 20:46:44 +02:00
|
|
|
return;
|
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_video_buffer_pp_read_ptr != m_video_buffer_read_ptr)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2020-11-14 04:33:26 +01:00
|
|
|
PanicAlertFmt("Desynced read pointers");
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
return;
|
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
write_ptr = m_video_buffer_write_ptr;
|
|
|
|
const size_t existing_len = write_ptr - m_video_buffer_pp_read_ptr;
|
2022-02-13 01:28:24 +01:00
|
|
|
if (GPFifo::GATHER_PIPE_SIZE > static_cast<size_t>(FIFO_SIZE - existing_len))
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
2022-02-13 01:28:24 +01:00
|
|
|
PanicAlertFmt("FIFO out of bounds (existing {} + new {} > {})", existing_len,
|
|
|
|
GPFifo::GATHER_PIPE_SIZE, FIFO_SIZE);
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
return;
|
2016-06-24 10:43:46 +02:00
|
|
|
}
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
2023-12-19 03:31:32 +01:00
|
|
|
auto& memory = m_system.GetMemory();
|
|
|
|
memory.CopyFromEmu(m_video_buffer_write_ptr, read_ptr, GPFifo::GATHER_PIPE_SIZE);
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_pp_read_ptr = OpcodeDecoder::RunFifo<true>(
|
|
|
|
DataReader(m_video_buffer_pp_read_ptr, write_ptr + GPFifo::GATHER_PIPE_SIZE), nullptr);
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// This would have to be locked if the GPU thread didn't spin.
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_write_ptr = write_ptr + GPFifo::GATHER_PIPE_SIZE;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::ResetVideoBuffer()
|
2011-01-29 22:26:46 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_read_ptr = m_video_buffer;
|
|
|
|
m_video_buffer_write_ptr = m_video_buffer;
|
|
|
|
m_video_buffer_seen_ptr = m_video_buffer;
|
|
|
|
m_video_buffer_pp_read_ptr = m_video_buffer;
|
|
|
|
m_fifo_aux_write_ptr = m_fifo_aux_data;
|
|
|
|
m_fifo_aux_read_ptr = m_fifo_aux_data;
|
2011-01-29 22:26:46 +01:00
|
|
|
}
|
|
|
|
|
2010-01-23 22:06:12 +01:00
|
|
|
// Description: Main FIFO update loop
|
|
|
|
// Purpose: Keep the Core HW updated about the CPU-GPU distance
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::RunGpuLoop()
|
2008-07-12 19:40:22 +02:00
|
|
|
{
|
2015-01-31 11:38:23 +01:00
|
|
|
AsyncRequests::GetInstance()->SetEnable(true);
|
2015-01-31 12:01:01 +01:00
|
|
|
AsyncRequests::GetInstance()->SetPassthrough(false);
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Run(
|
2023-12-19 03:31:32 +01:00
|
|
|
[this] {
|
2019-06-29 10:35:12 +02:00
|
|
|
// Run events from the CPU thread.
|
|
|
|
AsyncRequests::GetInstance()->PullEvents();
|
|
|
|
|
2015-05-27 20:53:09 +02:00
|
|
|
// Do nothing while paused
|
2022-12-09 20:01:25 +01:00
|
|
|
if (!m_emu_running_state.IsSet())
|
2015-05-27 20:53:09 +02:00
|
|
|
return;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_use_deterministic_gpu_thread)
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
{
|
|
|
|
// All the fifo/CP stuff is on the CPU. We just need to run the opcode decoder.
|
2022-12-09 20:01:25 +01:00
|
|
|
u8* seen_ptr = m_video_buffer_seen_ptr;
|
|
|
|
u8* write_ptr = m_video_buffer_write_ptr;
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// See comment in SyncGPU
|
|
|
|
if (write_ptr > seen_ptr)
|
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_read_ptr =
|
|
|
|
OpcodeDecoder::RunFifo(DataReader(m_video_buffer_read_ptr, write_ptr), nullptr);
|
|
|
|
m_video_buffer_seen_ptr = write_ptr;
|
2016-06-24 10:43:46 +02:00
|
|
|
}
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
2015-05-27 20:53:09 +02:00
|
|
|
else
|
2010-01-23 13:50:56 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
auto& command_processor = m_system.GetCommandProcessor();
|
2022-11-27 13:50:50 +01:00
|
|
|
auto& fifo = command_processor.GetFifo();
|
2023-12-20 14:40:38 +01:00
|
|
|
command_processor.SetCPStatusFromGPU();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// check if we are able to run this buffer
|
2022-11-27 13:50:50 +01:00
|
|
|
while (!command_processor.IsInterruptWaiting() &&
|
2021-05-13 19:30:30 +02:00
|
|
|
fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) &&
|
2023-12-19 03:31:32 +01:00
|
|
|
fifo.CPReadWriteDistance.load(std::memory_order_relaxed) &&
|
|
|
|
!AtBreakpoint(m_system))
|
2013-02-16 02:51:09 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_config_sync_gpu && m_sync_ticks.load() < m_config_sync_gpu_min_distance)
|
2015-06-03 23:21:46 +02:00
|
|
|
break;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2015-06-03 23:21:46 +02:00
|
|
|
u32 cyclesExecuted = 0;
|
2021-05-13 18:44:59 +02:00
|
|
|
u32 readPtr = fifo.CPReadPointer.load(std::memory_order_relaxed);
|
2023-12-19 03:31:32 +01:00
|
|
|
ReadDataFromFifo(readPtr);
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2021-05-13 18:44:59 +02:00
|
|
|
if (readPtr == fifo.CPEnd.load(std::memory_order_relaxed))
|
|
|
|
readPtr = fifo.CPBase.load(std::memory_order_relaxed);
|
2015-06-03 23:21:46 +02:00
|
|
|
else
|
2022-02-13 01:28:24 +01:00
|
|
|
readPtr += GPFifo::GATHER_PIPE_SIZE;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-02-13 01:28:24 +01:00
|
|
|
const s32 distance =
|
|
|
|
static_cast<s32>(fifo.CPReadWriteDistance.load(std::memory_order_relaxed)) -
|
|
|
|
GPFifo::GATHER_PIPE_SIZE;
|
|
|
|
ASSERT_MSG(COMMANDPROCESSOR, distance >= 0,
|
2021-11-10 23:42:49 +01:00
|
|
|
"Negative fifo.CPReadWriteDistance = {} in FIFO Loop !\nThat can produce "
|
2018-03-15 01:34:35 +01:00
|
|
|
"instability in the game. Please report it.",
|
2022-02-13 01:28:24 +01:00
|
|
|
distance);
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
u8* write_ptr = m_video_buffer_write_ptr;
|
|
|
|
m_video_buffer_read_ptr = OpcodeDecoder::RunFifo(
|
|
|
|
DataReader(m_video_buffer_read_ptr, write_ptr), &cyclesExecuted);
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2021-05-13 18:44:59 +02:00
|
|
|
fifo.CPReadPointer.store(readPtr, std::memory_order_relaxed);
|
2022-02-13 01:28:24 +01:00
|
|
|
fifo.CPReadWriteDistance.fetch_sub(GPFifo::GATHER_PIPE_SIZE, std::memory_order_seq_cst);
|
2022-12-09 20:01:25 +01:00
|
|
|
if ((write_ptr - m_video_buffer_read_ptr) == 0)
|
2021-05-13 18:44:59 +02:00
|
|
|
{
|
|
|
|
fifo.SafeCPReadPointer.store(fifo.CPReadPointer.load(std::memory_order_relaxed),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2023-12-20 14:40:38 +01:00
|
|
|
command_processor.SetCPStatusFromGPU();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_config_sync_gpu)
|
2015-03-29 12:20:24 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
cyclesExecuted = (int)(cyclesExecuted / m_config_sync_gpu_overclock);
|
|
|
|
int old = m_sync_ticks.fetch_sub(cyclesExecuted);
|
|
|
|
if (old >= m_config_sync_gpu_max_distance &&
|
|
|
|
old - (int)cyclesExecuted < m_config_sync_gpu_max_distance)
|
|
|
|
{
|
|
|
|
m_sync_wakeup_event.Set();
|
|
|
|
}
|
2015-03-29 12:20:24 +02:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// This call is pretty important in DualCore mode and must be called in the FIFO Loop.
|
|
|
|
// If we don't, s_swapRequested or s_efbAccessRequested won't be set to false
|
2021-08-04 00:55:38 +02:00
|
|
|
// leading the CPU thread to wait in Video_OutputXFB or Video_AccessEFB thus slowing
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
// things down.
|
2015-01-31 11:38:23 +01:00
|
|
|
AsyncRequests::GetInstance()->PullEvents();
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2015-06-03 23:21:46 +02:00
|
|
|
// fast skip remaining GPU time if fifo is empty
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_sync_ticks.load() > 0)
|
2015-06-03 23:21:46 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
int old = m_sync_ticks.exchange(0);
|
|
|
|
if (old >= m_config_sync_gpu_max_distance)
|
|
|
|
m_sync_wakeup_event.Set();
|
2015-06-03 23:21:46 +02:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2015-05-27 20:53:09 +02:00
|
|
|
// The fifo is empty and it's unlikely we will get any more work in the near future.
|
|
|
|
// Make sure VertexManager finishes drawing any primitives it has stored in it's buffer.
|
2016-08-22 05:02:37 +02:00
|
|
|
g_vertex_manager->Flush();
|
2022-09-26 23:41:56 +02:00
|
|
|
g_framebuffer_manager->RefreshPeekCache();
|
Add the 'desynced GPU thread' mode.
It's a relatively big commit (less big with -w), but it's hard to test
any of this separately...
The basic problem is that in netplay or movies, the state of the CPU
must be deterministic, including when the game receives notification
that the GPU has processed FIFO data. Dual core mode notifies the game
whenever the GPU thread actually gets around to doing the work, so it
isn't deterministic. Single core mode is because it notifies the game
'instantly' (after processing the data synchronously), but it's too slow
for many systems and games.
My old dc-netplay branch worked as follows: everything worked as normal
except the state of the CP registers was a lie, and the CPU thread only
delivered results when idle detection triggered (waiting for the GPU if
they weren't ready at that point). Usually, a game is idle iff all the
work for the frame has been done, except for a small amount of work
depending on the GPU result, so neither the CPU or the GPU waiting on
the other affected performance much. However, it's possible that the
game could be waiting for some earlier interrupt, and any of several
games which, for whatever reason, never went into a detectable idle
(even when I tried to improve the detection) would never receive results
at all. (The current method should have better compatibility, but it
also has slightly higher overhead and breaks some other things, so I
want to reimplement this, hopefully with less impact on the code, in the
future.)
With this commit, the basic idea is that the CPU thread acts as if the
work has been done instantly, like single core mode, but actually hands
it off asynchronously to the GPU thread (after backing up some data that
the game might change in memory before it's actually done). Since the
work isn't done, any feedback from the GPU to the CPU, such as real
XFB/EFB copies (virtual are OK), EFB pokes, performance queries, etc. is
broken; but most games work with these options disabled, and there is no
need to try to detect what the CPU thread is doing.
Technically: when the flag g_use_deterministic_gpu_thread (currently
stuck on) is on, the CPU thread calls RunGpu like in single core mode.
This function synchronously copies the data from the FIFO to the
internal video buffer and updates the CP registers, interrupts, etc.
However, instead of the regular ReadDataFromFifo followed by running the
opcode decoder, it runs ReadDataFromFifoOnCPU ->
OpcodeDecoder_Preprocess, which relatively quickly scans through the
FIFO data, detects SetFinish calls etc., which are immediately fired,
and saves certain associated data from memory (e.g. display lists) in
AuxBuffers (a parallel stream to the main FIFO, which is a bit slow at
the moment), before handing the data off to the GPU thread to actually
render. That makes up the bulk of this commit.
In various circumstances, including the aforementioned EFB pokes and
performance queries as well as swap requests (i.e. the end of a frame -
we don't want the CPU potentially pumping out frames too quickly and the
GPU falling behind*), SyncGPU is called to wait for actual completion.
The overhead mainly comes from OpcodeDecoder_Preprocess (which is,
again, synchronous), as well as the actual copying.
Currently, display lists and such are escrowed from main memory even
though they usually won't change over the course of a frame, and
textures are not even though they might, resulting in a small chance of
graphical glitches. When the texture locking (i.e. fault on write) code
lands, I can make this all correct and maybe a little faster.
* This suggests an alternate determinism method of just delaying results
until a short time before the end of each frame. For all I know this
might mostly work - I haven't tried it - but if any significant work
hinges on the competion of render to texture etc., the frame will be
missed.
2014-08-28 04:56:19 +02:00
|
|
|
}
|
2015-05-30 02:42:32 +02:00
|
|
|
},
|
|
|
|
100);
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2015-01-31 11:38:23 +01:00
|
|
|
AsyncRequests::GetInstance()->SetEnable(false);
|
2015-01-31 12:01:01 +01:00
|
|
|
AsyncRequests::GetInstance()->SetPassthrough(true);
|
2008-07-12 19:40:22 +02:00
|
|
|
}
|
2009-06-15 08:39:26 +02:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::FlushGpu()
|
2015-03-05 21:14:46 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
if (!m_system.IsDualCoreMode() || m_use_deterministic_gpu_thread)
|
2015-03-05 21:14:46 +01:00
|
|
|
return;
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Wait();
|
2015-05-27 20:53:09 +02:00
|
|
|
}
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::GpuMaySleep()
|
2015-05-27 20:53:09 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.AllowSleep();
|
2015-03-05 21:14:46 +01:00
|
|
|
}
|
2010-12-11 13:42:55 +01:00
|
|
|
|
2022-12-09 22:59:11 +01:00
|
|
|
bool AtBreakpoint(Core::System& system)
|
2010-12-11 13:42:55 +01:00
|
|
|
{
|
2022-11-27 13:50:50 +01:00
|
|
|
auto& command_processor = system.GetCommandProcessor();
|
|
|
|
const auto& fifo = command_processor.GetFifo();
|
2021-05-13 19:30:30 +02:00
|
|
|
return fifo.bFF_BPEnable.load(std::memory_order_relaxed) &&
|
|
|
|
(fifo.CPReadPointer.load(std::memory_order_relaxed) ==
|
|
|
|
fifo.CPBreakpoint.load(std::memory_order_relaxed));
|
2010-12-11 13:42:55 +01:00
|
|
|
}
|
Big Fifo Commit Part2: Now the fifo is more stable than my first commit, so is time...
- ReImplementing Single Core Mode like Dual Core Mode Style.
- Stage 1: My goal is, we have the Fifo, CommandProccessor code the more clear, maintenible and documented possible. When I quit dolphin I want any developer can continue with the work only reading the code.
* Big Refactoring: A lot of functions was changed the names, and modularized.
Now the FifoLoop and CatchUpGPU does not exist, was replaced by RunGpu() and RunGpuLoop().
The general idea is modeling the code like the real HW. The fifo is only a buffer where the Write Gather Pipe write the commands and from the Graphic Processor read these.
* Big Clean UP a lot of obsolete code and comments was deleted, like DcFakeWachDog, "Fifo very soon hack", etc.
In the stage 2, I will refactoring more code doing emphasis in the division of CommandProcessor, Fifo, Gpu Emulation. Beside I will comment all functions and variables in the code (Don't worry I will ask for English help for this part ;) )
Please test a lot SC mode and DC mode :)
Thank you so much for testing always and the patience. I don't like broke your favorite game but... you must believe me this part is very sensible, I only try to contribute for have a better and stable dolphin emulator.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7185 8ced0084-cf51-0410-be5f-012b33b47a6e
2011-02-17 05:25:21 +01:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::RunGpu()
|
Big Fifo Commit Part2: Now the fifo is more stable than my first commit, so is time...
- ReImplementing Single Core Mode like Dual Core Mode Style.
- Stage 1: My goal is, we have the Fifo, CommandProccessor code the more clear, maintenible and documented possible. When I quit dolphin I want any developer can continue with the work only reading the code.
* Big Refactoring: A lot of functions was changed the names, and modularized.
Now the FifoLoop and CatchUpGPU does not exist, was replaced by RunGpu() and RunGpuLoop().
The general idea is modeling the code like the real HW. The fifo is only a buffer where the Write Gather Pipe write the commands and from the Graphic Processor read these.
* Big Clean UP a lot of obsolete code and comments was deleted, like DcFakeWachDog, "Fifo very soon hack", etc.
In the stage 2, I will refactoring more code doing emphasis in the division of CommandProcessor, Fifo, Gpu Emulation. Beside I will comment all functions and variables in the code (Don't worry I will ask for English help for this part ;) )
Please test a lot SC mode and DC mode :)
Thank you so much for testing always and the patience. I don't like broke your favorite game but... you must believe me this part is very sensible, I only try to contribute for have a better and stable dolphin emulator.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@7185 8ced0084-cf51-0410-be5f-012b33b47a6e
2011-02-17 05:25:21 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
const bool is_dual_core = m_system.IsDualCoreMode();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2016-01-10 12:18:45 +01:00
|
|
|
// wake up GPU thread
|
2022-12-09 20:01:25 +01:00
|
|
|
if (is_dual_core && !m_use_deterministic_gpu_thread)
|
2013-02-16 02:51:09 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Wakeup();
|
2016-01-10 12:18:45 +01:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2016-01-10 12:18:45 +01:00
|
|
|
// if the sync GPU callback is suspended, wake it up.
|
2022-12-09 20:01:25 +01:00
|
|
|
if (!is_dual_core || m_use_deterministic_gpu_thread || m_config_sync_gpu)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_syncing_suspended)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_syncing_suspended = false;
|
2023-12-19 03:31:32 +01:00
|
|
|
m_system.GetCoreTiming().ScheduleEvent(GPU_TIME_SLOT_SIZE, m_event_sync_gpu,
|
|
|
|
GPU_TIME_SLOT_SIZE);
|
2015-03-05 17:12:24 +01:00
|
|
|
}
|
2016-01-10 12:18:45 +01:00
|
|
|
}
|
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
int FifoManager::RunGpuOnCpu(int ticks)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
auto& command_processor = m_system.GetCommandProcessor();
|
2022-11-27 13:50:50 +01:00
|
|
|
auto& fifo = command_processor.GetFifo();
|
2016-01-10 12:18:45 +01:00
|
|
|
bool reset_simd_state = false;
|
2022-12-09 20:01:25 +01:00
|
|
|
int available_ticks = int(ticks * m_config_sync_gpu_overclock) + m_sync_ticks.load();
|
2021-05-13 19:30:30 +02:00
|
|
|
while (fifo.bFF_GPReadEnable.load(std::memory_order_relaxed) &&
|
2023-12-19 03:31:32 +01:00
|
|
|
fifo.CPReadWriteDistance.load(std::memory_order_relaxed) && !AtBreakpoint(m_system) &&
|
2021-05-13 19:30:30 +02:00
|
|
|
available_ticks >= 0)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_use_deterministic_gpu_thread)
|
2015-03-14 08:02:16 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
ReadDataFromFifoOnCPU(fifo.CPReadPointer.load(std::memory_order_relaxed));
|
2022-12-09 20:01:25 +01:00
|
|
|
m_gpu_mainloop.Wakeup();
|
2015-03-14 08:02:16 +01:00
|
|
|
}
|
2016-01-10 12:18:45 +01:00
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!reset_simd_state)
|
|
|
|
{
|
2023-03-21 15:49:35 +01:00
|
|
|
Common::FPU::SaveSIMDState();
|
|
|
|
Common::FPU::LoadDefaultSIMDState();
|
2016-01-10 12:18:45 +01:00
|
|
|
reset_simd_state = true;
|
|
|
|
}
|
2023-12-19 03:31:32 +01:00
|
|
|
ReadDataFromFifo(fifo.CPReadPointer.load(std::memory_order_relaxed));
|
2016-01-10 12:18:45 +01:00
|
|
|
u32 cycles = 0;
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_read_ptr = OpcodeDecoder::RunFifo(
|
|
|
|
DataReader(m_video_buffer_read_ptr, m_video_buffer_write_ptr), &cycles);
|
2016-01-10 12:18:45 +01:00
|
|
|
available_ticks -= cycles;
|
|
|
|
}
|
|
|
|
|
2021-05-13 18:44:59 +02:00
|
|
|
if (fifo.CPReadPointer.load(std::memory_order_relaxed) ==
|
|
|
|
fifo.CPEnd.load(std::memory_order_relaxed))
|
|
|
|
{
|
|
|
|
fifo.CPReadPointer.store(fifo.CPBase.load(std::memory_order_relaxed),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2016-01-10 12:18:45 +01:00
|
|
|
else
|
2021-05-13 18:44:59 +02:00
|
|
|
{
|
2022-02-13 01:28:24 +01:00
|
|
|
fifo.CPReadPointer.fetch_add(GPFifo::GATHER_PIPE_SIZE, std::memory_order_relaxed);
|
2021-05-13 18:44:59 +02:00
|
|
|
}
|
2016-01-10 12:18:45 +01:00
|
|
|
|
2022-02-13 01:28:24 +01:00
|
|
|
fifo.CPReadWriteDistance.fetch_sub(GPFifo::GATHER_PIPE_SIZE, std::memory_order_relaxed);
|
2015-03-05 17:12:24 +01:00
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2023-12-20 14:40:38 +01:00
|
|
|
command_processor.SetCPStatusFromGPU();
|
2016-01-10 12:18:45 +01:00
|
|
|
|
|
|
|
if (reset_simd_state)
|
2015-03-05 17:12:24 +01:00
|
|
|
{
|
2023-03-21 15:49:35 +01:00
|
|
|
Common::FPU::LoadSIMDState();
|
2013-02-16 02:51:09 +01:00
|
|
|
}
|
2016-01-10 12:18:45 +01:00
|
|
|
|
|
|
|
// Discard all available ticks as there is nothing to do any more.
|
2022-12-09 20:01:25 +01:00
|
|
|
m_sync_ticks.store(std::min(available_ticks, 0));
|
2016-01-10 12:18:45 +01:00
|
|
|
|
|
|
|
// If the GPU is idle, drop the handler.
|
|
|
|
if (available_ticks >= 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
// Always wait at least for GPU_TIME_SLOT_SIZE cycles.
|
|
|
|
return -available_ticks + GPU_TIME_SLOT_SIZE;
|
2011-02-19 00:52:14 +01:00
|
|
|
}
|
2014-09-06 23:26:40 +02:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::UpdateWantDeterminism(bool want)
|
2014-09-06 23:26:40 +02:00
|
|
|
{
|
2015-03-29 15:05:11 +02:00
|
|
|
// We are paused (or not running at all yet), so
|
2014-09-06 23:26:40 +02:00
|
|
|
// it should be safe to change this.
|
2014-09-30 17:43:53 +02:00
|
|
|
bool gpu_thread = false;
|
2022-01-06 01:08:56 +01:00
|
|
|
switch (Config::GetGPUDeterminismMode())
|
2014-09-06 23:43:43 +02:00
|
|
|
{
|
2022-01-06 01:08:56 +01:00
|
|
|
case Config::GPUDeterminismMode::Auto:
|
2014-09-06 23:43:43 +02:00
|
|
|
gpu_thread = want;
|
|
|
|
break;
|
2022-01-06 01:08:56 +01:00
|
|
|
case Config::GPUDeterminismMode::Disabled:
|
2014-09-06 23:43:43 +02:00
|
|
|
gpu_thread = false;
|
|
|
|
break;
|
2022-01-06 01:08:56 +01:00
|
|
|
case Config::GPUDeterminismMode::FakeCompletion:
|
2014-09-06 23:43:43 +02:00
|
|
|
gpu_thread = true;
|
|
|
|
break;
|
|
|
|
}
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
gpu_thread = gpu_thread && m_system.IsDualCoreMode();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
if (m_use_deterministic_gpu_thread != gpu_thread)
|
2014-09-06 23:26:40 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
m_use_deterministic_gpu_thread = gpu_thread;
|
2014-09-06 23:43:43 +02:00
|
|
|
if (gpu_thread)
|
|
|
|
{
|
|
|
|
// These haven't been updated in non-deterministic mode.
|
2022-12-09 20:01:25 +01:00
|
|
|
m_video_buffer_seen_ptr = m_video_buffer_pp_read_ptr = m_video_buffer_read_ptr;
|
2014-09-06 23:43:43 +02:00
|
|
|
CopyPreprocessCPStateFromMain();
|
|
|
|
VertexLoaderManager::MarkAllDirty();
|
|
|
|
}
|
2014-09-06 23:26:40 +02:00
|
|
|
}
|
|
|
|
}
|
2015-06-03 23:21:46 +02:00
|
|
|
|
2016-01-20 19:42:37 +01:00
|
|
|
/* This function checks the emulated CPU - GPU distance and may wake up the GPU,
|
2016-01-10 12:18:45 +01:00
|
|
|
* or block the CPU if required. It should be called by the CPU thread regularly.
|
2016-01-20 19:42:37 +01:00
|
|
|
* @ticks The gone emulated CPU time.
|
2016-01-10 12:18:45 +01:00
|
|
|
* @return A good time to call WaitForGpuThread() next.
|
2016-01-20 19:42:37 +01:00
|
|
|
*/
|
2023-12-19 03:31:32 +01:00
|
|
|
int FifoManager::WaitForGpuThread(int ticks)
|
2015-06-03 23:21:46 +02:00
|
|
|
{
|
2022-12-09 20:01:25 +01:00
|
|
|
int old = m_sync_ticks.fetch_add(ticks);
|
2016-10-01 13:16:50 +02:00
|
|
|
int now = old + ticks;
|
|
|
|
|
|
|
|
// GPU is idle, so stop polling.
|
2022-12-09 20:01:25 +01:00
|
|
|
if (old >= 0 && m_gpu_mainloop.IsDone())
|
2016-10-01 13:16:50 +02:00
|
|
|
return -1;
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2016-01-20 19:42:37 +01:00
|
|
|
// Wakeup GPU
|
2022-12-09 20:01:25 +01:00
|
|
|
if (old < m_config_sync_gpu_min_distance && now >= m_config_sync_gpu_min_distance)
|
2023-12-19 03:31:32 +01:00
|
|
|
RunGpu();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2016-10-01 13:16:50 +02:00
|
|
|
// If the GPU is still sleeping, wait for a longer time
|
2022-12-09 20:01:25 +01:00
|
|
|
if (now < m_config_sync_gpu_min_distance)
|
|
|
|
return GPU_TIME_SLOT_SIZE + m_config_sync_gpu_min_distance - now;
|
2016-10-01 13:16:50 +02:00
|
|
|
|
2016-01-20 19:42:37 +01:00
|
|
|
// Wait for GPU
|
2022-12-09 20:01:25 +01:00
|
|
|
if (now >= m_config_sync_gpu_max_distance)
|
|
|
|
m_sync_wakeup_event.Wait();
|
2016-06-24 10:43:46 +02:00
|
|
|
|
2016-10-01 13:16:50 +02:00
|
|
|
return GPU_TIME_SLOT_SIZE;
|
2015-06-03 23:21:46 +02:00
|
|
|
}
|
2016-01-12 22:44:58 +01:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
void FifoManager::SyncGPUCallback(Core::System& system, u64 ticks, s64 cyclesLate)
|
2016-01-19 00:08:18 +01:00
|
|
|
{
|
2016-01-10 12:18:45 +01:00
|
|
|
ticks += cyclesLate;
|
|
|
|
int next = -1;
|
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
auto& fifo = system.GetFifo();
|
|
|
|
if (!system.IsDualCoreMode() || fifo.m_use_deterministic_gpu_thread)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
next = fifo.RunGpuOnCpu(int(ticks));
|
2016-01-10 12:18:45 +01:00
|
|
|
}
|
2022-12-09 20:01:25 +01:00
|
|
|
else if (fifo.m_config_sync_gpu)
|
2016-01-10 12:18:45 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
next = fifo.WaitForGpuThread(int(ticks));
|
2016-01-10 12:18:45 +01:00
|
|
|
}
|
2016-01-19 00:08:18 +01:00
|
|
|
|
2022-12-09 20:01:25 +01:00
|
|
|
fifo.m_syncing_suspended = next < 0;
|
|
|
|
if (!fifo.m_syncing_suspended)
|
|
|
|
system.GetCoreTiming().ScheduleEvent(next, fifo.m_event_sync_gpu, next);
|
2016-01-19 00:08:18 +01:00
|
|
|
}
|
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::SyncGPUForRegisterAccess()
|
2021-04-17 07:54:50 +02:00
|
|
|
{
|
|
|
|
SyncGPU(SyncGPUReason::Other);
|
|
|
|
|
2023-12-19 03:31:32 +01:00
|
|
|
if (!m_system.IsDualCoreMode() || m_use_deterministic_gpu_thread)
|
|
|
|
RunGpuOnCpu(GPU_TIME_SLOT_SIZE);
|
2022-12-09 20:01:25 +01:00
|
|
|
else if (m_config_sync_gpu)
|
2023-12-19 03:31:32 +01:00
|
|
|
WaitForGpuThread(GPU_TIME_SLOT_SIZE);
|
2021-04-17 07:54:50 +02:00
|
|
|
}
|
|
|
|
|
2016-01-20 19:42:37 +01:00
|
|
|
// Initialize GPU - CPU thread syncing, this gives us a deterministic way to start the GPU thread.
|
2023-12-19 03:31:32 +01:00
|
|
|
void FifoManager::Prepare()
|
2016-01-19 00:08:18 +01:00
|
|
|
{
|
2023-12-19 03:31:32 +01:00
|
|
|
m_event_sync_gpu = m_system.GetCoreTiming().RegisterEvent("SyncGPUCallback", SyncGPUCallback);
|
2022-12-09 20:01:25 +01:00
|
|
|
m_syncing_suspended = true;
|
2016-01-19 00:08:18 +01:00
|
|
|
}
|
2019-05-06 01:48:12 +02:00
|
|
|
} // namespace Fifo
|