aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authoryuzubot <[email protected]>2024-03-04 00:57:21 +0000
committeryuzubot <[email protected]>2024-03-04 00:57:21 +0000
commit276ceb26d0c58a00a0e65e3bf4d9c4371428f82d (patch)
tree587c6b5415501f1b1a1795ddd6df8d3403252cb4 /src
parent15e6e48bef0216480661444a8d8b348c1cca47bb (diff)
downloadyuzu-android-276ceb26d0c58a00a0e65e3bf4d9c4371428f82d.tar.gz
yuzu-android-276ceb26d0c58a00a0e65e3bf4d9c4371428f82d.zip
Merge yuzu-emu#12461
Diffstat (limited to 'src')
-rw-r--r--src/CMakeLists.txt1
-rw-r--r--src/core/device_memory_manager.h2
-rw-r--r--src/core/guest_memory.h62
-rw-r--r--src/core/hle/service/nvdrv/core/container.h3
-rw-r--r--src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp15
-rw-r--r--src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp27
-rw-r--r--src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h1
-rw-r--r--src/core/hle/service/nvdrv/devices/nvhost_vic.cpp15
-rw-r--r--src/core/memory.h2
-rw-r--r--src/video_core/CMakeLists.txt10
-rw-r--r--src/video_core/cdma_pusher.cpp194
-rw-r--r--src/video_core/cdma_pusher.h89
-rw-r--r--src/video_core/gpu.cpp33
-rw-r--r--src/video_core/gpu.h9
-rw-r--r--src/video_core/gpu_thread.cpp1
-rw-r--r--src/video_core/host1x/codecs/codec.cpp113
-rw-r--r--src/video_core/host1x/codecs/codec.h63
-rw-r--r--src/video_core/host1x/codecs/decoder.cpp71
-rw-r--r--src/video_core/host1x/codecs/decoder.h64
-rw-r--r--src/video_core/host1x/codecs/h264.cpp147
-rw-r--r--src/video_core/host1x/codecs/h264.h299
-rw-r--r--src/video_core/host1x/codecs/vp8.cpp73
-rw-r--r--src/video_core/host1x/codecs/vp8.h44
-rw-r--r--src/video_core/host1x/codecs/vp9.cpp152
-rw-r--r--src/video_core/host1x/codecs/vp9.h46
-rw-r--r--src/video_core/host1x/codecs/vp9_types.h27
-rw-r--r--src/video_core/host1x/control.cpp1
-rw-r--r--src/video_core/host1x/control.h10
-rw-r--r--src/video_core/host1x/ffmpeg/ffmpeg.cpp233
-rw-r--r--src/video_core/host1x/ffmpeg/ffmpeg.h61
-rw-r--r--src/video_core/host1x/host1x.cpp26
-rw-r--r--src/video_core/host1x/host1x.h148
-rw-r--r--src/video_core/host1x/nvdec.cpp62
-rw-r--r--src/video_core/host1x/nvdec.h34
-rw-r--r--src/video_core/host1x/nvdec_common.h84
-rw-r--r--src/video_core/host1x/sync_manager.cpp50
-rw-r--r--src/video_core/host1x/sync_manager.h53
-rw-r--r--src/video_core/host1x/syncpoint_manager.cpp6
-rw-r--r--src/video_core/host1x/vic.cpp1360
-rw-r--r--src/video_core/host1x/vic.h651
-rw-r--r--src/video_core/memory_manager.h2
41 files changed, 3124 insertions, 1220 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cf05a3fe3..697a9ab97 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -165,6 +165,7 @@ else()
if (MINGW)
add_definitions(-DMINGW_HAS_SECURE_API)
+ add_compile_options("-msse4.1")
if (MINGW_STATIC_BUILD)
add_definitions(-DQT_STATICPLUGIN)
diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h
index 0568a821b..6dcf7bb22 100644
--- a/src/core/device_memory_manager.h
+++ b/src/core/device_memory_manager.h
@@ -43,6 +43,8 @@ public:
DeviceMemoryManager(const DeviceMemory& device_memory);
~DeviceMemoryManager();
+ static constexpr bool HAS_FLUSH_INVALIDATION = true;
+
void BindInterface(DeviceInterface* device_inter);
DAddr Allocate(size_t size);
diff --git a/src/core/guest_memory.h b/src/core/guest_memory.h
index 7ee18c126..83292f702 100644
--- a/src/core/guest_memory.h
+++ b/src/core/guest_memory.h
@@ -44,15 +44,32 @@ public:
GuestMemory() = delete;
explicit GuestMemory(M& memory, u64 addr, std::size_t size,
Common::ScratchBuffer<T>* backup = nullptr)
- : m_memory{memory}, m_addr{addr}, m_size{size} {
+ : m_memory{&memory}, m_addr{addr}, m_size{size} {
static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write);
- if constexpr (FLAGS & GuestMemoryFlags::Read) {
+ if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
+ if (!this->TrySetSpan()) {
+ if (backup) {
+ backup->resize_destructive(this->size());
+ m_data_span = *backup;
+ m_span_valid = true;
+ m_is_data_copy = true;
+ } else {
+ m_data_copy.resize(this->size());
+ m_data_span = std::span(m_data_copy);
+ m_span_valid = true;
+ m_is_data_copy = true;
+ }
+ }
+ } else if constexpr (FLAGS & GuestMemoryFlags::Read) {
Read(addr, size, backup);
}
}
~GuestMemory() = default;
+ GuestMemory(GuestMemory&& rhs) = default;
+ GuestMemory& operator=(GuestMemory&& rhs) = default;
+
T* data() noexcept {
return m_data_span.data();
}
@@ -109,8 +126,8 @@ public:
}
if (this->TrySetSpan()) {
- if constexpr (FLAGS & GuestMemoryFlags::Safe) {
- m_memory.FlushRegion(m_addr, this->size_bytes());
+ if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) {
+ m_memory->FlushRegion(m_addr, this->size_bytes());
}
} else {
if (backup) {
@@ -123,9 +140,9 @@ public:
m_is_data_copy = true;
m_span_valid = true;
if constexpr (FLAGS & GuestMemoryFlags::Safe) {
- m_memory.ReadBlock(m_addr, this->data(), this->size_bytes());
+ m_memory->ReadBlock(m_addr, this->data(), this->size_bytes());
} else {
- m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
+ m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
}
}
return m_data_span;
@@ -133,18 +150,19 @@ public:
void Write(std::span<T> write_data) noexcept {
if constexpr (FLAGS & GuestMemoryFlags::Cached) {
- m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
+ m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
} else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
- m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes());
+ m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes());
} else {
- m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
+ m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
}
}
bool TrySetSpan() noexcept {
- if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) {
+ if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) {
m_data_span = {reinterpret_cast<T*>(ptr), this->size()};
m_span_valid = true;
+ m_is_data_copy = false;
return true;
}
return false;
@@ -159,7 +177,7 @@ protected:
return m_addr_changed;
}
- M& m_memory;
+ M* m_memory;
u64 m_addr{};
size_t m_size{};
std::span<T> m_data_span{};
@@ -175,17 +193,7 @@ public:
GuestMemoryScoped() = delete;
explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size,
Common::ScratchBuffer<T>* backup = nullptr)
- : GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {
- if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
- if (!this->TrySetSpan()) {
- if (backup) {
- this->m_data_span = *backup;
- this->m_span_valid = true;
- this->m_is_data_copy = true;
- }
- }
- }
- }
+ : GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {}
~GuestMemoryScoped() {
if constexpr (FLAGS & GuestMemoryFlags::Write) {
@@ -196,15 +204,17 @@ public:
if (this->AddressChanged() || this->IsDataCopy()) {
ASSERT(this->m_span_valid);
if constexpr (FLAGS & GuestMemoryFlags::Cached) {
- this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes());
+ this->m_memory->WriteBlockCached(this->m_addr, this->data(),
+ this->size_bytes());
} else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
- this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes());
+ this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes());
} else {
- this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes());
+ this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(),
+ this->size_bytes());
}
} else if constexpr ((FLAGS & GuestMemoryFlags::Safe) ||
(FLAGS & GuestMemoryFlags::Cached)) {
- this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes());
+ this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes());
}
}
}
diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h
index f159ced09..cf549d7f3 100644
--- a/src/core/hle/service/nvdrv/core/container.h
+++ b/src/core/hle/service/nvdrv/core/container.h
@@ -68,10 +68,7 @@ public:
const SyncpointManager& GetSyncpointManager() const;
struct Host1xDeviceFileData {
- std::unordered_map<DeviceFD, u32> fd_to_id{};
std::deque<u32> syncpts_accumulated{};
- u32 nvdec_next_id{};
- u32 vic_next_id{};
};
Host1xDeviceFileData& Host1xDeviceFile();
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index 2c0ac2a46..60b89b628 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -8,6 +8,7 @@
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
#include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
+#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices {
@@ -21,13 +22,8 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> in
switch (command.group) {
case 0x0:
switch (command.cmd) {
- case 0x1: {
- auto& host1x_file = core.Host1xDeviceFile();
- if (!host1x_file.fd_to_id.contains(fd)) {
- host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++;
- }
+ case 0x1:
return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd);
- }
case 0x2:
return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output);
case 0x3:
@@ -72,15 +68,12 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
LOG_INFO(Service_NVDRV, "NVDEC video stream started");
system.SetNVDECActive(true);
sessions[fd] = session_id;
+ host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint);
}
void nvhost_nvdec::OnClose(DeviceFD fd) {
LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
- auto& host1x_file = core.Host1xDeviceFile();
- const auto iter = host1x_file.fd_to_id.find(fd);
- if (iter != host1x_file.fd_to_id.end()) {
- system.GPU().ClearCdmaInstance(iter->second);
- }
+ host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec);
system.SetNVDECActive(false);
auto it = sessions.find(fd);
if (it != sessions.end()) {
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
index a0a7bfa40..9ca6308e6 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -55,8 +55,9 @@ std::size_t WriteVectors(std::span<u8> dst, const std::vector<T>& src, std::size
nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_,
NvCore::ChannelType channel_type_)
- : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()},
- nvmap{core.GetNvMapFile()}, channel_type{channel_type_} {
+ : nvdevice{system_}, host1x{system_.Host1x()}, core{core_},
+ syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()},
+ channel_type{channel_type_} {
auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated;
if (syncpts_accumulated.empty()) {
channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false);
@@ -95,24 +96,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span<u8> data, De
offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset);
offset += SliceVectors(data, fence_thresholds, params.fence_count, offset);
- auto& gpu = system.GPU();
auto* session = core.GetSession(sessions[fd]);
- if (gpu.UseNvdec()) {
- for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
- const SyncptIncr& syncpt_incr = syncpt_increments[i];
- fence_thresholds[i] =
- syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
- }
+ for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
+ const SyncptIncr& syncpt_incr = syncpt_increments[i];
+ fence_thresholds[i] =
+ syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
}
+
for (const auto& cmd_buffer : command_buffers) {
const auto object = nvmap.GetHandle(cmd_buffer.memory_id);
ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;);
- Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
- session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(),
- cmdlist.size() * sizeof(u32));
- gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist);
+ Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader,
+ Core::Memory::GuestMemoryFlags::SafeRead>
+ cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset,
+ cmd_buffer.word_count);
+ host1x.PushEntries(fd, std::move(cmdlist));
}
+
// Some games expect command_buffers to be written back
offset = 0;
offset += WriteVectors(data, command_buffers, offset);
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
index 900db81d2..63e637760 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -119,6 +119,7 @@ protected:
Kernel::KEvent* QueryEvent(u32 event_id) override;
+ Tegra::Host1x::Host1x& host1x;
u32 channel_syncpoint;
s32_le nvmap_fd{};
u32_le submit_timeout{};
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index bf090f5eb..8219a2c7e 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -7,6 +7,7 @@
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
#include "core/hle/service/nvdrv/devices/nvhost_vic.h"
+#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices {
@@ -21,13 +22,8 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inpu
switch (command.group) {
case 0x0:
switch (command.cmd) {
- case 0x1: {
- auto& host1x_file = core.Host1xDeviceFile();
- if (!host1x_file.fd_to_id.contains(fd)) {
- host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++;
- }
+ case 0x1:
return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd);
- }
case 0x2:
return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output);
case 0x3:
@@ -70,14 +66,11 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> inpu
void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
sessions[fd] = session_id;
+ host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint);
}
void nvhost_vic::OnClose(DeviceFD fd) {
- auto& host1x_file = core.Host1xDeviceFile();
- const auto iter = host1x_file.fd_to_id.find(fd);
- if (iter != host1x_file.fd_to_id.end()) {
- system.GPU().ClearCdmaInstance(iter->second);
- }
+ host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC);
sessions.erase(fd);
}
diff --git a/src/core/memory.h b/src/core/memory.h
index f7e6b297f..dcca26892 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -64,6 +64,8 @@ public:
Memory(Memory&&) = default;
Memory& operator=(Memory&&) = delete;
+ static constexpr bool HAS_FLUSH_INVALIDATION = false;
+
/**
* Resets the state of the Memory system.
*/
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2de2beb6e..a4b8e0252 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -60,8 +60,8 @@ add_library(video_core STATIC
framebuffer_config.h
fsr.cpp
fsr.h
- host1x/codecs/codec.cpp
- host1x/codecs/codec.h
+ host1x/codecs/decoder.cpp
+ host1x/codecs/decoder.h
host1x/codecs/h264.cpp
host1x/codecs/h264.h
host1x/codecs/vp8.cpp
@@ -80,8 +80,6 @@ add_library(video_core STATIC
host1x/nvdec.cpp
host1x/nvdec.h
host1x/nvdec_common.h
- host1x/sync_manager.cpp
- host1x/sync_manager.h
host1x/syncpoint_manager.cpp
host1x/syncpoint_manager.h
host1x/vic.cpp
@@ -392,4 +390,8 @@ if (ANDROID AND ARCHITECTURE_arm64)
target_link_libraries(video_core PRIVATE adrenotools)
endif()
+if (ARCHITECTURE_arm64)
+ target_link_libraries(video_core PRIVATE sse2neon)
+endif()
+
create_target_directory_groups(video_core)
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp
index 28a2d2090..3bcf1b066 100644
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -2,136 +2,130 @@
// SPDX-License-Identifier: MIT
#include <bit>
+
+#include "common/thread.h"
+#include "core/core.h"
#include "video_core/cdma_pusher.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/host1x/control.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
#include "video_core/host1x/nvdec_common.h"
-#include "video_core/host1x/sync_manager.h"
#include "video_core/host1x/vic.h"
#include "video_core/memory_manager.h"
namespace Tegra {
-CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_)
- : host1x{host1x_}, nvdec_processor(std::make_shared<Host1x::Nvdec>(host1x)),
- vic_processor(std::make_unique<Host1x::Vic>(host1x, nvdec_processor)),
- host1x_processor(std::make_unique<Host1x::Control>(host1x)),
- sync_manager(std::make_unique<Host1x::SyncptIncrManager>(host1x)) {}
+
+CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id)
+ : host1x{host1x_}, memory_manager{host1x.GMMU()},
+ host_processor{std::make_unique<Host1x::Control>(host1x_)}, current_class{
+ static_cast<ChClassId>(id)} {
+ thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); });
+}
CDmaPusher::~CDmaPusher() = default;
-void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) {
- for (const auto& value : entries) {
- if (mask != 0) {
- const auto lbs = static_cast<u32>(std::countr_zero(mask));
- mask &= ~(1U << lbs);
- ExecuteCommand(offset + lbs, value.raw);
- continue;
- } else if (count != 0) {
- --count;
- ExecuteCommand(offset, value.raw);
- if (incrementing) {
- ++offset;
+void CDmaPusher::ProcessEntries(std::stop_token stop_token) {
+ Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+ ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0};
+ u32 count{};
+ u32 method_offset{};
+ u32 mask{};
+ bool incrementing{};
+
+ while (!stop_token.stop_requested()) {
+ {
+ std::unique_lock l{command_mutex};
+ Common::CondvarWait(command_cv, l, stop_token,
+ [this]() { return command_lists.size() > 0; });
+ if (stop_token.stop_requested()) {
+ return;
}
- continue;
- }
- const auto mode = value.submission_mode.Value();
- switch (mode) {
- case ChSubmissionMode::SetClass: {
- mask = value.value & 0x3f;
- offset = value.method_offset;
- current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
- break;
- }
- case ChSubmissionMode::Incrementing:
- case ChSubmissionMode::NonIncrementing:
- count = value.value;
- offset = value.method_offset;
- incrementing = mode == ChSubmissionMode::Incrementing;
- break;
- case ChSubmissionMode::Mask:
- mask = value.value;
- offset = value.method_offset;
- break;
- case ChSubmissionMode::Immediate: {
- const u32 data = value.value & 0xfff;
- offset = value.method_offset;
- ExecuteCommand(offset, data);
- break;
+
+ command_list = std::move(command_lists.front());
+ command_lists.pop_front();
}
- default:
- UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
- break;
+
+ size_t i = 0;
+ for (const auto value : command_list) {
+ i++;
+ if (mask != 0) {
+ const auto lbs = static_cast<u32>(std::countr_zero(mask));
+ mask &= ~(1U << lbs);
+ ExecuteCommand(method_offset + lbs, value.raw);
+ continue;
+ } else if (count != 0) {
+ --count;
+ ExecuteCommand(method_offset, value.raw);
+ if (incrementing) {
+ ++method_offset;
+ }
+ continue;
+ }
+ const auto mode = value.submission_mode.Value();
+ switch (mode) {
+ case ChSubmissionMode::SetClass: {
+ mask = value.value & 0x3f;
+ method_offset = value.method_offset;
+ current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
+ break;
+ }
+ case ChSubmissionMode::Incrementing:
+ case ChSubmissionMode::NonIncrementing:
+ count = value.value;
+ method_offset = value.method_offset;
+ incrementing = mode == ChSubmissionMode::Incrementing;
+ break;
+ case ChSubmissionMode::Mask:
+ mask = value.value;
+ method_offset = value.method_offset;
+ break;
+ case ChSubmissionMode::Immediate: {
+ const u32 data = value.value & 0xfff;
+ method_offset = value.method_offset;
+ ExecuteCommand(method_offset, data);
+ break;
+ }
+ default:
+ LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1,
+ (i - 1) * sizeof(u32), command_list.size());
+ UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!",
+ static_cast<u32>(mode));
+ break;
+ }
}
}
}
-void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
+void CDmaPusher::ExecuteCommand(u32 method, u32 arg) {
switch (current_class) {
- case ChClassId::NvDec:
- ThiStateWrite(nvdec_thi_state, offset, data);
- switch (static_cast<ThiMethod>(offset)) {
- case ThiMethod::IncSyncpt: {
- LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
- const auto syncpoint_id = static_cast<u32>(data & 0xFF);
- const auto cond = static_cast<u32>((data >> 8) & 0xFF);
- if (cond == 0) {
- sync_manager->Increment(syncpoint_id);
- } else {
- sync_manager->SignalDone(
- sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
- }
- break;
- }
- case ThiMethod::SetMethod1:
- LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
- static_cast<u32>(nvdec_thi_state.method_0));
- nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data);
- break;
- default:
- break;
- }
+ case ChClassId::Control:
+ LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
+ static_cast<u32>(current_class), method, arg);
+ host_processor->ProcessMethod(static_cast<Host1x::Control::Method>(method), arg);
break;
- case ChClassId::GraphicsVic:
- ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
- switch (static_cast<ThiMethod>(state_offset)) {
+ default:
+ thi_regs.reg_array[method] = arg;
+ switch (static_cast<ThiMethod>(method)) {
case ThiMethod::IncSyncpt: {
- LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
- const auto syncpoint_id = static_cast<u32>(data & 0xFF);
- const auto cond = static_cast<u32>((data >> 8) & 0xFF);
- if (cond == 0) {
- sync_manager->Increment(syncpoint_id);
- } else {
- sync_manager->SignalDone(
- sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
- }
+ const auto syncpoint_id = static_cast<u32>(arg & 0xFF);
+ [[maybe_unused]] const auto cond = static_cast<u32>((arg >> 8) & 0xFF);
+ LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}",
+ static_cast<u32>(current_class), syncpoint_id, cond);
+ auto& syncpoint_manager = host1x.GetSyncpointManager();
+ syncpoint_manager.IncrementGuest(syncpoint_id);
+ syncpoint_manager.IncrementHost(syncpoint_id);
break;
}
case ThiMethod::SetMethod1:
- LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
- static_cast<u32>(vic_thi_state.method_0), data);
- vic_processor->ProcessMethod(static_cast<Host1x::Vic::Method>(vic_thi_state.method_0),
- data);
+ LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
+ static_cast<u32>(current_class), static_cast<u32>(thi_regs.method_0), arg);
+ ProcessMethod(thi_regs.method_0, arg);
break;
default:
break;
}
- break;
- case ChClassId::Control:
- // This device is mainly for syncpoint synchronization
- LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
- host1x_processor->ProcessMethod(static_cast<Host1x::Control::Method>(offset), data);
- break;
- default:
- UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
- break;
}
}
-void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) {
- u8* const offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
- std::memcpy(offset_ptr, &argument, sizeof(u32));
-}
-
} // namespace Tegra
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
index 7d660af47..becbccef1 100644
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -3,12 +3,18 @@
#pragma once
+#include <condition_variable>
+#include <deque>
#include <memory>
+#include <mutex>
+#include <thread>
#include <vector>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
+#include "common/polyfill_thread.h"
+#include "core/memory.h"
namespace Tegra {
@@ -62,23 +68,31 @@ struct ChCommand {
std::vector<u32> arguments;
};
-using ChCommandHeaderList = std::vector<ChCommandHeader>;
+using ChCommandHeaderList =
+ Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader, Core::Memory::GuestMemoryFlags::SafeRead>;
struct ThiRegisters {
- u32_le increment_syncpt{};
- INSERT_PADDING_WORDS(1);
- u32_le increment_syncpt_error{};
- u32_le ctx_switch_incremement_syncpt{};
- INSERT_PADDING_WORDS(4);
- u32_le ctx_switch{};
- INSERT_PADDING_WORDS(1);
- u32_le ctx_syncpt_eof{};
- INSERT_PADDING_WORDS(5);
- u32_le method_0{};
- u32_le method_1{};
- INSERT_PADDING_WORDS(12);
- u32_le int_status{};
- u32_le int_mask{};
+ static constexpr std::size_t NUM_REGS = 0x20;
+
+ union {
+ struct {
+ u32_le increment_syncpt;
+ INSERT_PADDING_WORDS_NOINIT(1);
+ u32_le increment_syncpt_error;
+ u32_le ctx_switch_incremement_syncpt;
+ INSERT_PADDING_WORDS_NOINIT(4);
+ u32_le ctx_switch;
+ INSERT_PADDING_WORDS_NOINIT(1);
+ u32_le ctx_syncpt_eof;
+ INSERT_PADDING_WORDS_NOINIT(5);
+ u32_le method_0;
+ u32_le method_1;
+ INSERT_PADDING_WORDS_NOINIT(12);
+ u32_le int_status;
+ u32_le int_mask;
+ };
+ std::array<u32, NUM_REGS> reg_array;
+ };
};
enum class ThiMethod : u32 {
@@ -89,32 +103,39 @@ enum class ThiMethod : u32 {
class CDmaPusher {
public:
- explicit CDmaPusher(Host1x::Host1x& host1x);
- ~CDmaPusher();
+ CDmaPusher() = delete;
+ virtual ~CDmaPusher();
- /// Process the command entry
- void ProcessEntries(ChCommandHeaderList&& entries);
+ void PushEntries(ChCommandHeaderList&& entries) {
+ std::scoped_lock l{command_mutex};
+ command_lists.push_back(std::move(entries));
+ command_cv.notify_one();
+ }
+
+protected:
+ explicit CDmaPusher(Host1x::Host1x& host1x, s32 id);
+
+ virtual void ProcessMethod(u32 method, u32 arg) = 0;
+
+ Host1x::Host1x& host1x;
+ Tegra::MemoryManager& memory_manager;
private:
+ /// Process the command entry
+ void ProcessEntries(std::stop_token stop_token);
+
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 state_offset, u32 data);
- /// Write arguments value to the ThiRegisters member at the specified offset
- void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument);
+ std::unique_ptr<Host1x::Control> host_processor;
- Host1x::Host1x& host1x;
- std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
- std::unique_ptr<Tegra::Host1x::Vic> vic_processor;
- std::unique_ptr<Tegra::Host1x::Control> host1x_processor;
- std::unique_ptr<Host1x::SyncptIncrManager> sync_manager;
- ChClassId current_class{};
- ThiRegisters vic_thi_state{};
- ThiRegisters nvdec_thi_state{};
-
- u32 count{};
- u32 offset{};
- u32 mask{};
- bool incrementing{};
+ std::mutex command_mutex;
+ std::condition_variable_any command_cv;
+ std::deque<ChCommandHeaderList> command_lists;
+ std::jthread thread;
+
+ ThiRegisters thi_regs{};
+ ChClassId current_class;
};
} // namespace Tegra
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 6d0b32339..c816f47fe 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -250,30 +250,6 @@ struct GPU::Impl {
gpu_thread.SubmitList(channel, std::move(entries));
}
- /// Push GPU command buffer entries to be processed
- void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
- if (!use_nvdec) {
- return;
- }
-
- if (!cdma_pushers.contains(id)) {
- cdma_pushers.insert_or_assign(id, std::make_unique<Tegra::CDmaPusher>(host1x));
- }
-
- // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
- // TODO(ameerj): RE proper async nvdec operation
- // gpu_thread.SubmitCommandBuffer(std::move(entries));
- cdma_pushers[id]->ProcessEntries(std::move(entries));
- }
-
- /// Frees the CDMAPusher instance to free up resources
- void ClearCdmaInstance(u32 id) {
- const auto iter = cdma_pushers.find(id);
- if (iter != cdma_pushers.end()) {
- cdma_pushers.erase(iter);
- }
- }
-
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(DAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
@@ -362,7 +338,6 @@ struct GPU::Impl {
Core::System& system;
Host1x::Host1x& host1x;
- std::map<u32, std::unique_ptr<Tegra::CDmaPusher>> cdma_pushers;
std::unique_ptr<VideoCore::RendererBase> renderer;
VideoCore::RasterizerInterface* rasterizer = nullptr;
const bool use_nvdec;
@@ -556,14 +531,6 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) {
impl->PushGPUEntries(channel, std::move(entries));
}
-void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
- impl->PushCommandBuffer(id, entries);
-}
-
-void GPU::ClearCdmaInstance(u32 id) {
- impl->ClearCdmaInstance(id);
-}
-
VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) {
return impl->OnCPURead(addr, size);
}
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 50014e51f..8a06adad7 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -234,15 +234,6 @@ public:
/// Push GPU command entries to be processed
void PushGPUEntries(s32 channel, Tegra::CommandList&& entries);
- /// Push GPU command buffer entries to be processed
- void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries);
-
- /// Frees the CDMAPusher instance to free up resources
- void ClearCdmaInstance(u32 id);
-
- /// Swap buffers (render frame)
- void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
-
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
[[nodiscard]] VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size);
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 477e11457..e2bfdcd7f 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -12,6 +12,7 @@
#include "video_core/dma_pusher.h"
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
+#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace VideoCommon::GPUThread {
diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp
deleted file mode 100644
index 1030db681..000000000
--- a/src/video_core/host1x/codecs/codec.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include "common/assert.h"
-#include "common/settings.h"
-#include "video_core/host1x/codecs/codec.h"
-#include "video_core/host1x/codecs/h264.h"
-#include "video_core/host1x/codecs/vp8.h"
-#include "video_core/host1x/codecs/vp9.h"
-#include "video_core/host1x/host1x.h"
-#include "video_core/memory_manager.h"
-
-namespace Tegra {
-
-Codec::Codec(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs)
- : host1x(host1x_), state{regs}, h264_decoder(std::make_unique<Decoder::H264>(host1x)),
- vp8_decoder(std::make_unique<Decoder::VP8>(host1x)),
- vp9_decoder(std::make_unique<Decoder::VP9>(host1x)) {}
-
-Codec::~Codec() = default;
-
-void Codec::Initialize() {
- initialized = decode_api.Initialize(current_codec);
-}
-
-void Codec::SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec) {
- if (current_codec != codec) {
- current_codec = codec;
- LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", GetCurrentCodecName());
- }
-}
-
-void Codec::Decode() {
- const bool is_first_frame = !initialized;
- if (is_first_frame) {
- Initialize();
- }
-
- if (!initialized) {
- return;
- }
-
- // Assemble bitstream.
- bool vp9_hidden_frame = false;
- size_t configuration_size = 0;
- const auto packet_data = [&]() {
- switch (current_codec) {
- case Tegra::Host1x::NvdecCommon::VideoCodec::H264:
- return h264_decoder->ComposeFrame(state, &configuration_size, is_first_frame);
- case Tegra::Host1x::NvdecCommon::VideoCodec::VP8:
- return vp8_decoder->ComposeFrame(state);
- case Tegra::Host1x::NvdecCommon::VideoCodec::VP9:
- vp9_decoder->ComposeFrame(state);
- vp9_hidden_frame = vp9_decoder->WasFrameHidden();
- return vp9_decoder->GetFrameBytes();
- default:
- ASSERT(false);
- return std::span<const u8>{};
- }
- }();
-
- // Send assembled bitstream to decoder.
- if (!decode_api.SendPacket(packet_data, configuration_size)) {
- return;
- }
-
- // Only receive/store visible frames.
- if (vp9_hidden_frame) {
- return;
- }
-
- // Receive output frames from decoder.
- decode_api.ReceiveFrames(frames);
-
- while (frames.size() > 10) {
- LOG_DEBUG(HW_GPU, "ReceiveFrames overflow, dropped frame");
- frames.pop();
- }
-}
-
-std::unique_ptr<FFmpeg::Frame> Codec::GetCurrentFrame() {
- // Sometimes VIC will request more frames than have been decoded.
- // in this case, return a blank frame and don't overwrite previous data.
- if (frames.empty()) {
- return {};
- }
-
- auto frame = std::move(frames.front());
- frames.pop();
- return frame;
-}
-
-Host1x::NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
- return current_codec;
-}
-
-std::string_view Codec::GetCurrentCodecName() const {
- switch (current_codec) {
- case Host1x::NvdecCommon::VideoCodec::None:
- return "None";
- case Host1x::NvdecCommon::VideoCodec::H264:
- return "H264";
- case Host1x::NvdecCommon::VideoCodec::VP8:
- return "VP8";
- case Host1x::NvdecCommon::VideoCodec::H265:
- return "H265";
- case Host1x::NvdecCommon::VideoCodec::VP9:
- return "VP9";
- default:
- return "Unknown";
- }
-}
-} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/codec.h b/src/video_core/host1x/codecs/codec.h
deleted file mode 100644
index f700ae129..000000000
--- a/src/video_core/host1x/codecs/codec.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include <memory>
-#include <optional>
-#include <string_view>
-#include <queue>
-#include "common/common_types.h"
-#include "video_core/host1x/ffmpeg/ffmpeg.h"
-#include "video_core/host1x/nvdec_common.h"
-
-namespace Tegra {
-
-namespace Decoder {
-class H264;
-class VP8;
-class VP9;
-} // namespace Decoder
-
-namespace Host1x {
-class Host1x;
-} // namespace Host1x
-
-class Codec {
-public:
- explicit Codec(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs);
- ~Codec();
-
- /// Initialize the codec, returning success or failure
- void Initialize();
-
- /// Sets NVDEC video stream codec
- void SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec);
-
- /// Call decoders to construct headers, decode AVFrame with ffmpeg
- void Decode();
-
- /// Returns next decoded frame
- [[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetCurrentFrame();
-
- /// Returns the value of current_codec
- [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const;
-
- /// Return name of the current codec
- [[nodiscard]] std::string_view GetCurrentCodecName() const;
-
-private:
- bool initialized{};
- Host1x::NvdecCommon::VideoCodec current_codec{Host1x::NvdecCommon::VideoCodec::None};
- FFmpeg::DecodeApi decode_api;
-
- Host1x::Host1x& host1x;
- const Host1x::NvdecCommon::NvdecRegisters& state;
- std::unique_ptr<Decoder::H264> h264_decoder;
- std::unique_ptr<Decoder::VP8> vp8_decoder;
- std::unique_ptr<Decoder::VP9> vp9_decoder;
-
- std::queue<std::unique_ptr<FFmpeg::Frame>> frames{};
-};
-
-} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp
new file mode 100644
index 000000000..49a601969
--- /dev/null
+++ b/src/video_core/host1x/codecs/decoder.cpp
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/assert.h"
+#include "common/settings.h"
+#include "video_core/host1x/codecs/decoder.h"
+#include "video_core/host1x/host1x.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+
+Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_,
+ Host1x::FrameQueue& frame_queue_)
+ : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{
+ frame_queue_} {} // Binds references and the id only; codec/initialized are assigned by the H264 and VP8 constructors in this patch.
+
+Decoder::~Decoder() = default;
+
+void Decoder::Decode() { // Composes one packet, submits it to decode_api and queues the received frame.
+ if (!initialized) { // Nothing to do until decode_api.Initialize() has succeeded.
+ return;
+ }
+
+ const auto packet_data = ComposeFrame();
+ // Send assembled bitstream to decoder; give up on this frame if submission fails.
+ if (!decode_api.SendPacket(packet_data)) {
+ return;
+ }
+
+ // Only receive/store visible frames.
+ if (vp9_hidden_frame) {
+ return;
+ }
+
+ // Receive the output frame from the decoder (may be null, see the checks below).
+ auto frame = decode_api.ReceiveFrame();
+
+ if (IsInterlaced()) {
+ auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = GetInterlacedOffsets();
+ auto frame_copy = frame; // Copy so the same result can be queued under both field offsets.
+
+ if (!frame.get()) { // Only logged: the null frame is still pushed to the queue below.
+ LOG_ERROR(HW_GPU,
+ "Nvdec {} failed to decode interlaced frame for top 0x{:X} bottom 0x{:X}", id,
+ luma_top, luma_bottom);
+ }
+
+ if (UsingDecodeOrder()) {
+ frame_queue.PushDecodeOrder(id, luma_top, std::move(frame));
+ frame_queue.PushDecodeOrder(id, luma_bottom, std::move(frame_copy));
+ } else {
+ frame_queue.PushPresentOrder(id, luma_top, std::move(frame));
+ frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy));
+ }
+ } else {
+ auto [luma_offset, chroma_offset] = GetProgressiveOffsets(); // Only the luma offset is used.
+
+ if (!frame.get()) { // Only logged: the null frame is still pushed to the queue below.
+ LOG_ERROR(HW_GPU, "Nvdec {} failed to decode progressive frame for luma 0x{:X}", id,
+ luma_offset);
+ }
+
+ if (UsingDecodeOrder()) {
+ frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame));
+ } else {
+ frame_queue.PushPresentOrder(id, luma_offset, std::move(frame));
+ }
+ }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h
new file mode 100644
index 000000000..22e6db815
--- /dev/null
+++ b/src/video_core/host1x/codecs/decoder.h
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string_view>
+#include <unordered_map>
+#include <queue>
+
+#include "common/common_types.h"
+#include "video_core/host1x/ffmpeg/ffmpeg.h"
+#include "video_core/host1x/nvdec_common.h"
+
+namespace Tegra {
+
+namespace Host1x {
+class Host1x;
+class FrameQueue;
+} // namespace Host1x
+
+class Decoder { // Abstract base shared by the per-codec decoders (H264 derives from it in this patch).
+public:
+ virtual ~Decoder();
+
+ /// Composes a packet via ComposeFrame(), decodes it through decode_api and pushes the frame to frame_queue
+ void Decode();
+
+ bool UsingDecodeOrder() const { // Forwards to decode_api; Decode() uses it to pick PushDecodeOrder vs PushPresentOrder.
+ return decode_api.UsingDecodeOrder();
+ }
+
+ /// Returns the value of the codec member
+ [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const {
+ return codec;
+ }
+
+ /// Return name of the current codec
+ [[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0;
+
+protected:
+ explicit Decoder(Host1x::Host1x& host1x, s32 id,
+ const Host1x::NvdecCommon::NvdecRegisters& regs,
+ Host1x::FrameQueue& frame_queue);
+
+ virtual std::span<const u8> ComposeFrame() = 0; // Bytes handed to decode_api.SendPacket(). NOTE(review): <span>/<tuple> are not in this header's visible includes; presumably transitive, confirm.
+ virtual std::tuple<u64, u64> GetProgressiveOffsets() = 0; // {luma, chroma}; Decode() keys the queued frame by luma.
+ virtual std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() = 0; // {luma_top, luma_bottom, chroma_top, chroma_bottom}.
+ virtual bool IsInterlaced() = 0; // Selects the interlaced or progressive push path in Decode().
+
+ Host1x::Host1x& host1x;
+ Tegra::MemoryManager& memory_manager; // Initialised from host1x.GMMU() in the constructor.
+ const Host1x::NvdecCommon::NvdecRegisters& regs;
+ s32 id; // Passed to the frame_queue push calls and included in the error logs.
+ Host1x::FrameQueue& frame_queue;
+ Host1x::NvdecCommon::VideoCodec codec; // No default initialiser here; assigned in the derived constructors.
+ FFmpeg::DecodeApi decode_api;
+ bool initialized{}; // Decode() returns immediately while this is false.
+ bool vp9_hidden_frame{}; // When true, Decode() stops after SendPacket() and queues nothing.
+};
+
+} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index 994591c8d..782d11d72 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Ryujinx Team and Contributors
-// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
#include <array>
#include <bit>
@@ -10,7 +10,7 @@
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
-namespace Tegra::Decoder {
+namespace Tegra::Decoders {
namespace {
// ZigZag LUTs from libavcodec.
constexpr std::array<u8, 64> zig_zag_direct{
@@ -25,23 +25,56 @@ constexpr std::array<u8, 16> zig_zag_scan{
};
} // Anonymous namespace
-H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {}
+H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
+ Host1x::FrameQueue& frame_queue_)
+ : Decoder{host1x_, id_, regs_, frame_queue_} { // The base takes (host1x, id, regs, queue): id and regs are swapped relative to this signature.
+ codec = Host1x::NvdecCommon::VideoCodec::H264;
+ initialized = decode_api.Initialize(codec); // If this is false, Decoder::Decode() becomes a no-op.
+}
H264::~H264() = default;
-std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
- size_t* out_configuration_size, bool is_first_frame) {
- H264DecoderContext context;
- host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
+std::tuple<u64, u64> H264::GetProgressiveOffsets() { // Returns {luma, chroma} for the current picture.
+ auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; // current_context holds whatever the last ComposeFrame() read.
+ auto luma{regs.surface_luma_offsets[pic_idx].Address() + // Per-picture surface entry from the registers...
+ current_context.h264_parameter_set.luma_frame_offset.Address()}; // ...plus the frame offset from the picture info.
+ auto chroma{regs.surface_chroma_offsets[pic_idx].Address() +
+ current_context.h264_parameter_set.chroma_frame_offset.Address()};
+ return {luma, chroma};
+}
+
+std::tuple<u64, u64, u64, u64> H264::GetInterlacedOffsets() { // Returns {luma_top, luma_bottom, chroma_top, chroma_bottom}.
+ auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; // current_context holds whatever the last ComposeFrame() read.
+ auto luma_top{regs.surface_luma_offsets[pic_idx].Address() + // Both fields share the same surface entry...
+ current_context.h264_parameter_set.luma_top_offset.Address()}; // ...and differ only in the top/bottom offset added.
+ auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() +
+ current_context.h264_parameter_set.luma_bot_offset.Address()};
+ auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() +
+ current_context.h264_parameter_set.chroma_top_offset.Address()};
+ auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() +
+ current_context.h264_parameter_set.chroma_bot_offset.Address()};
+ return {luma_top, luma_bottom, chroma_top, chroma_bottom};
+}
+
+bool H264::IsInterlaced() { // True when either field luma offset (top or bottom) in the current picture info is non-zero.
+ return current_context.h264_parameter_set.luma_top_offset.Address() != 0 ||
+ current_context.h264_parameter_set.luma_bot_offset.Address() != 0;
+}
+
+std::span<const u8> H264::ComposeFrame() {
+ memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
+ sizeof(H264DecoderContext));
- const s64 frame_number = context.h264_parameter_set.frame_number.Value();
+ const s64 frame_number = current_context.h264_parameter_set.frame_number.Value();
if (!is_first_frame && frame_number != 0) {
- frame.resize_destructive(context.stream_len);
- host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
- *out_configuration_size = 0;
- return frame;
+ frame_scratch.resize_destructive(current_context.stream_len);
+ memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(),
+ frame_scratch.size());
+ return frame_scratch;
}
+ is_first_frame = false;
+
// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
@@ -53,7 +86,7 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteU(31, 8);
writer.WriteUe(0);
const u32 chroma_format_idc =
- static_cast<u32>(context.h264_parameter_set.chroma_format_idc.Value());
+ static_cast<u32>(current_context.h264_parameter_set.chroma_format_idc.Value());
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
@@ -61,42 +94,44 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteUe(0);
writer.WriteUe(0);
- writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
+ writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0);
writer.WriteBit(false); // Scaling matrix present flag
- writer.WriteUe(static_cast<u32>(context.h264_parameter_set.log2_max_frame_num_minus4.Value()));
+ writer.WriteUe(
+ static_cast<u32>(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value()));
const auto order_cnt_type =
- static_cast<u32>(context.h264_parameter_set.pic_order_cnt_type.Value());
+ static_cast<u32>(current_context.h264_parameter_set.pic_order_cnt_type.Value());
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
- writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
+ writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
} else if (order_cnt_type == 1) {
- writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
- const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units /
- (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
+ const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs /
+ (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
- // TODO (ameerj): Where do we get this number, it seems to be particular for each stream
- const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue();
- const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu;
- const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u;
+ u32 max_num_ref_frames =
+ std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active,
+ current_context.h264_parameter_set.num_refidx_l1_default_active) +
+ 1,
+ 4);
writer.WriteUe(max_num_ref_frames);
writer.WriteBit(false);
- writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
+ writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
- writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0);
- if (!context.h264_parameter_set.frame_mbs_only_flag) {
- writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
+ if (!current_context.h264_parameter_set.frame_mbs_only_flag) {
+ writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
}
- writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
+ writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
@@ -111,57 +146,59 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteUe(0);
writer.WriteUe(0);
- writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
- writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0);
writer.WriteUe(0);
- writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
- writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
- writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0);
- writer.WriteU(static_cast<s32>(context.h264_parameter_set.weighted_bipred_idc.Value()), 2);
- s32 pic_init_qp = static_cast<s32>(context.h264_parameter_set.pic_init_qp_minus26.Value());
+ writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active);
+ writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active);
+ writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0);
+ writer.WriteU(static_cast<s32>(current_context.h264_parameter_set.weighted_bipred_idc.Value()),
+ 2);
+ s32 pic_init_qp =
+ static_cast<s32>(current_context.h264_parameter_set.pic_init_qp_minus26.Value());
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
- static_cast<s32>(context.h264_parameter_set.chroma_qp_index_offset.Value());
+ static_cast<s32>(current_context.h264_parameter_set.chroma_qp_index_offset.Value());
writer.WriteSe(chroma_qp_index_offset);
- writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
- writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
- writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
- writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
+ writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
+ writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true); // pic_scaling_matrix_present_flag
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
- std::span<const u8> matrix{context.weight_scale};
- writer.WriteScalingList(scan, matrix, index * 16, 16);
+ std::span<const u8> matrix{current_context.weight_scale_4x4};
+ writer.WriteScalingList(scan_scratch, matrix, index * 16, 16);
}
- if (context.h264_parameter_set.transform_8x8_mode_flag) {
+ if (current_context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
- std::span<const u8> matrix{context.weight_scale_8x8};
- writer.WriteScalingList(scan, matrix, index * 64, 64);
+ std::span<const u8> matrix{current_context.weight_scale_8x8};
+ writer.WriteScalingList(scan_scratch, matrix, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
- static_cast<s32>(context.h264_parameter_set.second_chroma_qp_index_offset.Value());
+ static_cast<s32>(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value());
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
- frame.resize(encoded_header.size() + context.stream_len);
- std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
+ frame_scratch.resize(encoded_header.size() + current_context.stream_len);
+ std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size());
- *out_configuration_size = encoded_header.size();
- host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(),
- context.stream_len);
+ memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
+ frame_scratch.data() + encoded_header.size(),
+ current_context.stream_len);
- return frame;
+ return frame_scratch;
}
H264BitWriter::H264BitWriter() = default;
@@ -278,4 +315,4 @@ void H264BitWriter::Flush() {
buffer = 0;
buffer_pos = 0;
}
-} // namespace Tegra::Decoder
+} // namespace Tegra::Decoders
diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h
index 1deaf4632..d946c6937 100644
--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: Ryujinx Team and Contributors
-// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
@@ -10,6 +10,7 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/scratch_buffer.h"
+#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
@@ -18,7 +19,7 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
-namespace Decoder {
+namespace Decoders {
class H264BitWriter {
public:
@@ -60,123 +61,213 @@ private:
std::vector<u8> byte_array;
};
-class H264 {
-public:
- explicit H264(Host1x::Host1x& host1x);
- ~H264();
-
- /// Compose the H264 frame for FFmpeg decoding
- [[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
- size_t* out_configuration_size,
- bool is_first_frame = false);
+struct Offset {
+ constexpr u32 Address() const noexcept {
+ return offset << 8;
+ }
private:
- Common::ScratchBuffer<u8> frame;
- Common::ScratchBuffer<u8> scan;
- Host1x::Host1x& host1x;
-
- struct H264ParameterSet {
- s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
- s32 delta_pic_order_always_zero_flag; ///< 0x04
- s32 frame_mbs_only_flag; ///< 0x08
- u32 pic_width_in_mbs; ///< 0x0C
- u32 frame_height_in_map_units; ///< 0x10
- union { ///< 0x14
- BitField<0, 2, u32> tile_format;
- BitField<2, 3, u32> gob_height;
- };
- u32 entropy_coding_mode_flag; ///< 0x18
- s32 pic_order_present_flag; ///< 0x1C
- s32 num_refidx_l0_default_active; ///< 0x20
- s32 num_refidx_l1_default_active; ///< 0x24
- s32 deblocking_filter_control_present_flag; ///< 0x28
- s32 redundant_pic_cnt_present_flag; ///< 0x2C
- u32 transform_8x8_mode_flag; ///< 0x30
- u32 pitch_luma; ///< 0x34
- u32 pitch_chroma; ///< 0x38
- u32 luma_top_offset; ///< 0x3C
- u32 luma_bot_offset; ///< 0x40
- u32 luma_frame_offset; ///< 0x44
- u32 chroma_top_offset; ///< 0x48
- u32 chroma_bot_offset; ///< 0x4C
- u32 chroma_frame_offset; ///< 0x50
- u32 hist_buffer_size; ///< 0x54
- union { ///< 0x58
- union {
- BitField<0, 1, u64> mbaff_frame;
- BitField<1, 1, u64> direct_8x8_inference;
- BitField<2, 1, u64> weighted_pred;
- BitField<3, 1, u64> constrained_intra_pred;
- BitField<4, 1, u64> ref_pic;
- BitField<5, 1, u64> field_pic;
- BitField<6, 1, u64> bottom_field;
- BitField<7, 1, u64> second_field;
- } flags;
- BitField<8, 4, u64> log2_max_frame_num_minus4;
- BitField<12, 2, u64> chroma_format_idc;
- BitField<14, 2, u64> pic_order_cnt_type;
- BitField<16, 6, s64> pic_init_qp_minus26;
- BitField<22, 5, s64> chroma_qp_index_offset;
- BitField<27, 5, s64> second_chroma_qp_index_offset;
- BitField<32, 2, u64> weighted_bipred_idc;
- BitField<34, 7, u64> curr_pic_idx;
- BitField<41, 5, u64> curr_col_idx;
- BitField<46, 16, u64> frame_number;
- BitField<62, 1, u64> frame_surfaces;
- BitField<63, 1, u64> output_memory_layout;
- };
+ u32 offset;
+};
+static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
+static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
+
+struct H264ParameterSet {
+ s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
+ s32 delta_pic_order_always_zero_flag; ///< 0x04
+ s32 frame_mbs_only_flag; ///< 0x08
+ u32 pic_width_in_mbs; ///< 0x0C
+ u32 frame_height_in_mbs; ///< 0x10
+ union { ///< 0x14
+ BitField<0, 2, u32> tile_format;
+ BitField<2, 3, u32> gob_height;
+ BitField<5, 27, u32> reserved_surface_format;
};
- static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
-
- struct H264DecoderContext {
- INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000
- u32 stream_len; ///< 0x0048
- INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C
- H264ParameterSet h264_parameter_set; ///< 0x0058
- INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8
- std::array<u8, 0x60> weight_scale; ///< 0x01C0
- std::array<u8, 0x80> weight_scale_8x8; ///< 0x0220
+ u32 entropy_coding_mode_flag; ///< 0x18
+ s32 pic_order_present_flag; ///< 0x1C
+ s32 num_refidx_l0_default_active; ///< 0x20
+ s32 num_refidx_l1_default_active; ///< 0x24
+ s32 deblocking_filter_control_present_flag; ///< 0x28
+ s32 redundant_pic_cnt_present_flag; ///< 0x2C
+ u32 transform_8x8_mode_flag; ///< 0x30
+ u32 pitch_luma; ///< 0x34
+ u32 pitch_chroma; ///< 0x38
+ Offset luma_top_offset; ///< 0x3C
+ Offset luma_bot_offset; ///< 0x40
+ Offset luma_frame_offset; ///< 0x44
+ Offset chroma_top_offset; ///< 0x48
+ Offset chroma_bot_offset; ///< 0x4C
+ Offset chroma_frame_offset; ///< 0x50
+ u32 hist_buffer_size; ///< 0x54
+ union { ///< 0x58
+ union {
+ BitField<0, 1, u64> mbaff_frame;
+ BitField<1, 1, u64> direct_8x8_inference;
+ BitField<2, 1, u64> weighted_pred;
+ BitField<3, 1, u64> constrained_intra_pred;
+ BitField<4, 1, u64> ref_pic;
+ BitField<5, 1, u64> field_pic;
+ BitField<6, 1, u64> bottom_field;
+ BitField<7, 1, u64> second_field;
+ } flags;
+ BitField<8, 4, u64> log2_max_frame_num_minus4;
+ BitField<12, 2, u64> chroma_format_idc;
+ BitField<14, 2, u64> pic_order_cnt_type;
+ BitField<16, 6, s64> pic_init_qp_minus26;
+ BitField<22, 5, s64> chroma_qp_index_offset;
+ BitField<27, 5, s64> second_chroma_qp_index_offset;
+ BitField<32, 2, u64> weighted_bipred_idc;
+ BitField<34, 7, u64> curr_pic_idx;
+ BitField<41, 5, u64> curr_col_idx;
+ BitField<46, 16, u64> frame_number;
+ BitField<62, 1, u64> frame_surfaces;
+ BitField<63, 1, u64> output_memory_layout;
};
- static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size");
+};
+static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
#define ASSERT_POSITION(field_name, position) \
static_assert(offsetof(H264ParameterSet, field_name) == position, \
"Field " #field_name " has invalid position")
- ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
- ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
- ASSERT_POSITION(frame_mbs_only_flag, 0x08);
- ASSERT_POSITION(pic_width_in_mbs, 0x0C);
- ASSERT_POSITION(frame_height_in_map_units, 0x10);
- ASSERT_POSITION(tile_format, 0x14);
- ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
- ASSERT_POSITION(pic_order_present_flag, 0x1C);
- ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
- ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
- ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
- ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
- ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
- ASSERT_POSITION(pitch_luma, 0x34);
- ASSERT_POSITION(pitch_chroma, 0x38);
- ASSERT_POSITION(luma_top_offset, 0x3C);
- ASSERT_POSITION(luma_bot_offset, 0x40);
- ASSERT_POSITION(luma_frame_offset, 0x44);
- ASSERT_POSITION(chroma_top_offset, 0x48);
- ASSERT_POSITION(chroma_bot_offset, 0x4C);
- ASSERT_POSITION(chroma_frame_offset, 0x50);
- ASSERT_POSITION(hist_buffer_size, 0x54);
- ASSERT_POSITION(flags, 0x58);
+ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
+ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
+ASSERT_POSITION(frame_mbs_only_flag, 0x08);
+ASSERT_POSITION(pic_width_in_mbs, 0x0C);
+ASSERT_POSITION(frame_height_in_mbs, 0x10);
+ASSERT_POSITION(tile_format, 0x14);
+ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
+ASSERT_POSITION(pic_order_present_flag, 0x1C);
+ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
+ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
+ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
+ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
+ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
+ASSERT_POSITION(pitch_luma, 0x34);
+ASSERT_POSITION(pitch_chroma, 0x38);
+ASSERT_POSITION(luma_top_offset, 0x3C);
+ASSERT_POSITION(luma_bot_offset, 0x40);
+ASSERT_POSITION(luma_frame_offset, 0x44);
+ASSERT_POSITION(chroma_top_offset, 0x48);
+ASSERT_POSITION(chroma_bot_offset, 0x4C);
+ASSERT_POSITION(chroma_frame_offset, 0x50);
+ASSERT_POSITION(hist_buffer_size, 0x54);
+ASSERT_POSITION(flags, 0x58);
#undef ASSERT_POSITION
+struct DpbEntry { // Element type of H264DecoderContext::dpb (an array of 16 of these).
+ union { // All bit-fields below overlay a single 32-bit word.
+ BitField<0, 7, u32> index;
+ BitField<7, 5, u32> col_idx;
+ BitField<12, 2, u32> state;
+ BitField<14, 1, u32> is_long_term;
+ BitField<15, 1, u32> non_existing;
+ BitField<16, 1, u32> is_field;
+ BitField<17, 4, u32> top_field_marking;
+ BitField<21, 4, u32> bottom_field_marking;
+ BitField<25, 1, u32> output_memory_layout;
+ BitField<26, 6, u32> reserved;
+ } flags;
+ std::array<u32, 2> field_order_cnt;
+ u32 frame_idx;
+};
+static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!"); // 4 (flags) + 8 (field_order_cnt) + 4 (frame_idx).
+
+struct DisplayParam { // Embedded in H264DecoderContext as display_param.
+ union { // First 32-bit word.
+ BitField<0, 1, u32> enable_tf_output;
+ BitField<1, 1, u32> vc1_map_y_flag;
+ BitField<2, 3, u32> map_y_value;
+ BitField<5, 1, u32> vc1_map_uv_flag;
+ BitField<6, 3, u32> map_uv_value;
+ BitField<9, 8, u32> out_stride;
+ BitField<17, 3, u32> tiling_format;
+ BitField<20, 1, u32> output_structure; // 0=frame, 1=field
+ BitField<21, 11, u32> reserved0;
+ };
+ std::array<s32, 2> output_top;
+ std::array<s32, 2> output_bottom;
+ union { // Histogram enable flag and start coordinates, packed into one 32-bit word.
+ BitField<0, 1, u32> enable_histogram;
+ BitField<1, 12, u32> histogram_start_x;
+ BitField<13, 12, u32> histogram_start_y;
+ BitField<25, 7, u32> reserved1;
+ };
+ union { // Histogram end coordinates, packed into one 32-bit word.
+ BitField<0, 12, u32> histogram_end_x;
+ BitField<12, 12, u32> histogram_end_y;
+ BitField<24, 8, u32> reserved2;
+ };
+};
+static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!"); // 4 + 8 + 8 + 4 + 4 bytes.
+
+struct H264DecoderContext { // Picture info block that H264::ComposeFrame() reads from regs.picture_info_offset via ReadBlock.
+ INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000
+ std::array<u8, 16> eos; ///< 0x0034
+ u8 explicit_eos_present_flag; ///< 0x0044
+ u8 hint_dump_en; ///< 0x0045
+ INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046
+ u32 stream_len; ///< 0x0048
+ u32 slice_count; ///< 0x004C
+ u32 mbhist_buffer_size; ///< 0x0050
+ u32 gptimer_timeout_value; ///< 0x0054
+ H264ParameterSet h264_parameter_set; ///< 0x0058
+ std::array<s32, 2> curr_field_order_cnt; ///< 0x00B8
+ std::array<DpbEntry, 16> dpb; ///< 0x00C0
+ std::array<u8, 0x60> weight_scale_4x4; ///< 0x01C0
+ std::array<u8, 0x80> weight_scale_8x8; ///< 0x0220
+ std::array<u8, 2> num_inter_view_refs_lX; ///< 0x02A0
+ std::array<u8, 14> reserved2; ///< 0x02A2
+ std::array<std::array<s8, 16>, 2> inter_view_refidx_lX; ///< 0x02B0
+ union { ///< 0x02D0
+ BitField<0, 1, u32> lossless_ipred8x8_filter_enable;
+ BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag;
+ BitField<2, 30, u32> reserved3;
+ };
+ DisplayParam display_param; ///< 0x02D4
+ std::array<u32, 3> reserved4; ///< 0x02F0
+};
+static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size"); // ComposeFrame() reads exactly sizeof(H264DecoderContext) bytes into it.
+
// Compile-time checks that selected H264DecoderContext fields sit at the byte offsets
// annotated in the struct. The helper macro is local to this list and is undefined right after.
#define ASSERT_POSITION(field_name, position) \
static_assert(offsetof(H264DecoderContext, field_name) == position, \
"Field " #field_name " has invalid position")
- ASSERT_POSITION(stream_len, 0x48);
- ASSERT_POSITION(h264_parameter_set, 0x58);
- ASSERT_POSITION(weight_scale, 0x1C0);
+ASSERT_POSITION(stream_len, 0x48);
+ASSERT_POSITION(h264_parameter_set, 0x58);
+ASSERT_POSITION(dpb, 0xC0);
+ASSERT_POSITION(weight_scale_4x4, 0x1C0);
#undef ASSERT_POSITION
+
/// H.264 implementation of the Decoder interface.
/// Copy and move operations are all deleted, so an instance stays where it was constructed.
+class H264 final : public Decoder {
+public:
+ explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
+ Host1x::FrameQueue& frame_queue);
+ ~H264() override;
+
+ H264(const H264&) = delete;
+ H264& operator=(const H264&) = delete;
+
+ H264(H264&&) = delete;
+ H264& operator=(H264&&) = delete;
+
+ /// Compose the H264 frame for FFmpeg decoding
+ [[nodiscard]] std::span<const u8> ComposeFrame() override;
+
// Surface-address queries and the interlace flag; their definitions are not in this header.
+ std::tuple<u64, u64> GetProgressiveOffsets() override;
+ std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
+ bool IsInterlaced() override;
+
// Fixed codec label for this decoder.
+ std::string_view GetCurrentCodecName() const override {
+ return "H264";
+ }
+
+private:
// Starts out true. NOTE(review): presumably cleared once the first frame has been composed —
// confirm in h264.cpp.
+ bool is_first_frame{true};
// Reusable byte buffers owned by the decoder; their use is defined in h264.cpp.
+ Common::ScratchBuffer<u8> frame_scratch;
+ Common::ScratchBuffer<u8> scan_scratch;
// Value-initialised copy of the decoder context for the frame being handled.
+ H264DecoderContext current_context{};
};
-} // namespace Decoder
+} // namespace Decoders
} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp
index be97e3b00..6094f16e0 100644
--- a/src/video_core/host1x/codecs/vp8.cpp
+++ b/src/video_core/host1x/codecs/vp8.cpp
@@ -7,47 +7,70 @@
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
-namespace Tegra::Decoder {
-VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {}
+namespace Tegra::Decoders {
// Builds a VP8 decoder: hands the arguments to the Decoder base (which takes the id before the
// register block), selects the VP8 codec, and stores the result of decode_api.Initialize()
// in `initialized`.
+VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
+ Host1x::FrameQueue& frame_queue_)
+ : Decoder{host1x_, id_, regs_, frame_queue_} {
+ codec = Host1x::NvdecCommon::VideoCodec::VP8;
+ initialized = decode_api.Initialize(codec);
+}
VP8::~VP8() = default;
-std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
- VP8PictureInfo info;
- host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo));
// Returns {luma, chroma} addresses taken from the Vp8SurfaceIndex::Current slot of the
// surface offset registers.
+std::tuple<u64, u64> VP8::GetProgressiveOffsets() {
+ auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ return {luma, chroma};
+}
+
// Returns {luma_top, luma_bottom, chroma_top, chroma_bottom}.
// Top and bottom are read from the same Current slot, so each pair holds identical addresses;
// VP8::IsInterlaced() in vp8.h always returns false.
+std::tuple<u64, u64, u64, u64> VP8::GetInterlacedOffsets() {
+ auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ auto luma_bottom{
+ regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ auto chroma_top{
+ regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ auto chroma_bottom{
+ regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
+ return {luma_top, luma_bottom, chroma_top, chroma_bottom};
+}
+
// Builds one VP8 frame for the decoder:
//  1. reads VP8PictureInfo from guest memory at regs.picture_info_offset into current_context;
//  2. writes a 3-byte frame tag, extended to 10 bytes for key frames, into frame_scratch;
//  3. appends vld_buffer_size bytes read from regs.frame_bitstream_offset.
// The returned span views frame_scratch, so it reflects whatever that buffer holds next.
+std::span<const u8> VP8::ComposeFrame() {
+ memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
+ sizeof(VP8PictureInfo));
- const bool is_key_frame = info.key_frame == 1u;
- const auto bitstream_size = static_cast<size_t>(info.vld_buffer_size);
+ const bool is_key_frame = current_context.key_frame == 1u;
+ const auto bitstream_size = static_cast<size_t>(current_context.vld_buffer_size);
const size_t header_size = is_key_frame ? 10u : 3u;
- frame.resize(header_size + bitstream_size);
+ frame_scratch.resize(header_size + bitstream_size);
// Based on page 30 of the VP8 specification.
// https://datatracker.ietf.org/doc/rfc6386/
- frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
- frame[0] |= static_cast<u8>((info.version & 7u) << 1u); // 3-bit version number
- frame[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag
+ frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
+ frame_scratch[0] |=
+ static_cast<u8>((current_context.version & 7u) << 1u); // 3-bit version number
+ frame_scratch[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag
// The next 19-bits are the first partition size
- frame[0] |= static_cast<u8>((info.first_part_size & 7u) << 5u);
- frame[1] = static_cast<u8>((info.first_part_size & 0x7f8u) >> 3u);
- frame[2] = static_cast<u8>((info.first_part_size & 0x7f800u) >> 11u);
+ frame_scratch[0] |= static_cast<u8>((current_context.first_part_size & 7u) << 5u);
+ frame_scratch[1] = static_cast<u8>((current_context.first_part_size & 0x7f8u) >> 3u);
+ frame_scratch[2] = static_cast<u8>((current_context.first_part_size & 0x7f800u) >> 11u);
if (is_key_frame) {
// Bytes 3..5 are the fixed key-frame start code 0x9d 0x01 0x2a.
- frame[3] = 0x9du;
- frame[4] = 0x01u;
- frame[5] = 0x2au;
+ frame_scratch[3] = 0x9du;
+ frame_scratch[4] = 0x01u;
+ frame_scratch[5] = 0x2au;
// TODO(ameerj): Horizontal/Vertical Scale
// 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits)
- frame[6] = static_cast<u8>(info.frame_width & 0xff);
- frame[7] = static_cast<u8>(((info.frame_width >> 8) & 0x3f));
+ frame_scratch[6] = static_cast<u8>(current_context.frame_width & 0xff);
+ frame_scratch[7] = static_cast<u8>(((current_context.frame_width >> 8) & 0x3f));
// 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits)
- frame[8] = static_cast<u8>(info.frame_height & 0xff);
- frame[9] = static_cast<u8>(((info.frame_height >> 8) & 0x3f));
+ frame_scratch[8] = static_cast<u8>(current_context.frame_height & 0xff);
+ frame_scratch[9] = static_cast<u8>(((current_context.frame_height >> 8) & 0x3f));
}
// The compressed payload is copied straight after the header bytes written above.
- const u64 bitstream_offset = state.frame_bitstream_offset;
- host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size);
+ const u64 bitstream_offset = regs.frame_bitstream_offset.Address();
+ memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size);
- return frame;
+ return frame_scratch;
}
-} // namespace Tegra::Decoder
+} // namespace Tegra::Decoders
diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h
index 5945e4658..74800281d 100644
--- a/src/video_core/host1x/codecs/vp8.h
+++ b/src/video_core/host1x/codecs/vp8.h
@@ -9,6 +9,7 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/scratch_buffer.h"
+#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
@@ -17,20 +18,41 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
-namespace Decoder {
+namespace Decoders {
/// Slot numbers used to index regs.surface_luma_offsets / regs.surface_chroma_offsets
/// in the VP8 decoder (vp8.cpp reads the Current slot).
+enum class Vp8SurfaceIndex : u32 {
+ Last = 0,
+ Golden = 1,
+ AltRef = 2,
+ Current = 3,
+};
-class VP8 {
+class VP8 final : public Decoder {
public:
- explicit VP8(Host1x::Host1x& host1x);
- ~VP8();
+ explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
+ Host1x::FrameQueue& frame_queue);
+ ~VP8() override;
+
+ VP8(const VP8&) = delete;
+ VP8& operator=(const VP8&) = delete;
+
+ VP8(VP8&&) = delete;
+ VP8& operator=(VP8&&) = delete;
+
+ [[nodiscard]] std::span<const u8> ComposeFrame() override;
- /// Compose the VP8 frame for FFmpeg decoding
- [[nodiscard]] std::span<const u8> ComposeFrame(
- const Host1x::NvdecCommon::NvdecRegisters& state);
+ std::tuple<u64, u64> GetProgressiveOffsets() override;
+ std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
+
+ bool IsInterlaced() override {
+ return false;
+ }
+
+ std::string_view GetCurrentCodecName() const override {
+ return "VP8";
+ }
private:
- Common::ScratchBuffer<u8> frame;
- Host1x::Host1x& host1x;
+ Common::ScratchBuffer<u8> frame_scratch;
struct VP8PictureInfo {
INSERT_PADDING_WORDS_NOINIT(14);
@@ -73,7 +95,9 @@ private:
INSERT_PADDING_WORDS_NOINIT(3);
};
static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size");
+
+ VP8PictureInfo current_context{};
};
-} // namespace Decoder
+} // namespace Decoders
} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp
index 65d6fb2d5..c70d0a506 100644
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -4,12 +4,13 @@
#include <algorithm> // for std::copy
#include <numeric>
+#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/host1x/codecs/vp9.h"
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
-namespace Tegra::Decoder {
+namespace Tegra::Decoders {
namespace {
constexpr u32 diff_update_probability = 252;
constexpr u32 frame_sync_code = 0x498342;
@@ -237,7 +238,12 @@ constexpr std::array<u8, 254> map_lut{
}
} // Anonymous namespace
-VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {}
// Builds a VP9 decoder: hands the arguments to the Decoder base (which takes the id before the
// register block), selects the VP9 codec, and stores the result of decode_api.Initialize()
// in `initialized`.
+VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
+ Host1x::FrameQueue& frame_queue_)
+ : Decoder{host1x_, id_, regs_, frame_queue_} {
+ codec = Host1x::NvdecCommon::VideoCodec::VP9;
+ initialized = decode_api.Initialize(codec);
+}
VP9::~VP9() = default;
@@ -356,35 +362,113 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_
}
}
-Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) {
- PictureInfo picture_info;
- host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
- Vp9PictureInfo vp9_info = picture_info.Convert();
// Writes the segmentation part of the uncompressed header from current_picture_info.segmentation.
// Order of emitted fields: enabled bit; update_map bit (plus tree/prediction probabilities when
// set); a "data changed" bit; and, when changed, abs_delta followed by per-segment features.
// Nothing beyond the first bit is written when segmentation is disabled.
+void VP9::WriteSegmentation(VpxBitStreamWriter& writer) {
+ bool enabled = current_picture_info.segmentation.enabled != 0;
+ writer.WriteBit(enabled);
+ if (!enabled) {
+ return;
+ }
+
+ auto update_map = current_picture_info.segmentation.update_map != 0;
+ writer.WriteBit(update_map);
+
+ if (update_map) {
// The probabilities come from the table at regs.vp9_prob_tab_buffer_offset in guest memory.
+ EntropyProbs entropy_probs{};
+ memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs,
+ sizeof(entropy_probs));
+
// A probability of 255 is written as a single 0 bit; any other value is a 1 bit followed
// by the 8-bit probability.
+ auto WriteProb = [&](u8 prob) {
+ bool coded = prob != 255;
+ writer.WriteBit(coded);
+ if (coded) {
+ writer.WriteU(prob, 8);
+ }
+ };
+
+ for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) {
+ WriteProb(entropy_probs.mb_segment_tree_probs[i]);
+ }
+
+ auto temporal_update = current_picture_info.segmentation.temporal_update != 0;
+ writer.WriteBit(temporal_update);
+
+ if (temporal_update) {
+ for (s32 i = 0; i < 3; i++) {
+ WriteProb(entropy_probs.segment_pred_probs[i]);
+ }
+ }
+ }
+
// Feature data is only re-sent when the whole Segmentation struct differs from the one cached
// by the previous call; otherwise a single 0 bit is written.
// NOTE(review): the comparison does not consider the frame type — confirm the decoder never
// expects the data to be re-sent (e.g. after a key frame) when the struct is unchanged.
+ if (last_segmentation == current_picture_info.segmentation) {
+ writer.WriteBit(false);
+ return;
+ }
+
+ last_segmentation = current_picture_info.segmentation;
+ writer.WriteBit(true);
+ writer.WriteBit(current_picture_info.segmentation.abs_delta != 0);
+
+ constexpr s32 MAX_SEGMENTS = 8;
// Bit widths for features 0..2; feature 3 has width 0 and only its enable bit is written.
+ constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0};
+
+ for (s32 i = 0; i < MAX_SEGMENTS; i++) {
// Feature 0: signed value, 8 bits.
+ auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0;
+ writer.WriteBit(q_enabled);
+ if (q_enabled) {
+ writer.WriteS(current_picture_info.segmentation.feature_data[i][0],
+ SegmentationFeatureBits[0]);
+ }
+
// Feature 1: signed value, 6 bits.
+ auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0;
+ writer.WriteBit(lf_enabled);
+ if (lf_enabled) {
+ writer.WriteS(current_picture_info.segmentation.feature_data[i][1],
+ SegmentationFeatureBits[1]);
+ }
+
// Feature 2: unsigned value, 2 bits.
+ auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0;
+ writer.WriteBit(ref_enabled);
+ if (ref_enabled) {
+ writer.WriteU(current_picture_info.segmentation.feature_data[i][2],
+ SegmentationFeatureBits[2]);
+ }
+
// Feature 3: enable bit only, no payload.
+ auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0;
+ writer.WriteBit(skip_enabled);
+ }
+}
+
// Reads the raw PictureInfo from guest memory at regs.picture_info_offset into
// current_picture_info, converts it to Vp9PictureInfo, fills in the entropy table from
// regs.vp9_prob_tab_buffer_offset, and copies the first four luma surface addresses into
// frame_offsets.
+Vp9PictureInfo VP9::GetVp9PictureInfo() {
+ memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_picture_info,
+ sizeof(PictureInfo));
+ Vp9PictureInfo vp9_info = current_picture_info.Convert();
- InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
+ InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current.
- std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
- vp9_info.frame_offsets.begin());
+ for (size_t i = 0; i < 4; i++) {
+ vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address();
+ }
return vp9_info;
}
// Reads an EntropyProbs table from guest memory at `offset` and converts it into `dst`.
void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
EntropyProbs entropy;
- host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
+ memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs));
entropy.Convert(dst);
}
-Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
+Vp9FrameContainer VP9::GetCurrentFrame() {
Vp9FrameContainer current_frame{};
{
// gpu.SyncGuestHost(); epic, why?
- current_frame.info = GetVp9PictureInfo(state);
+ current_frame.info = GetVp9PictureInfo();
current_frame.bit_stream.resize(current_frame.info.bitstream_size);
- host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
- current_frame.info.bitstream_size);
+ memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
+ current_frame.bit_stream.data(),
+ current_frame.info.bitstream_size);
}
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{
@@ -742,8 +826,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
- ASSERT(!current_frame_info.segment_enabled);
- uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
+ WriteSegmentation(uncomp_writer);
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
@@ -770,10 +853,29 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
return uncomp_writer;
}
-void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
// Returns {luma, chroma} addresses taken from the Vp9SurfaceIndex::Current slot of the
// surface offset registers.
+std::tuple<u64, u64> VP9::GetProgressiveOffsets() {
+ auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ return {luma, chroma};
+}
+
// Returns {luma_top, luma_bottom, chroma_top, chroma_bottom}.
// Top and bottom are read from the same Current slot, so each pair holds identical addresses;
// VP9::IsInterlaced() in vp9.h always returns false.
+std::tuple<u64, u64, u64, u64> VP9::GetInterlacedOffsets() {
+ auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ auto luma_bottom{
+ regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ auto chroma_top{
+ regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ auto chroma_bottom{
+ regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
+ return {luma_top, luma_bottom, chroma_top, chroma_bottom};
+}
+
+std::span<const u8> VP9::ComposeFrame() {
+ vp9_hidden_frame = false;
+
std::vector<u8> bitstream;
{
- Vp9FrameContainer curr_frame = GetCurrentFrame(state);
+ Vp9FrameContainer curr_frame = GetCurrentFrame();
current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream);
}
@@ -786,12 +888,16 @@ void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
// Write headers and frame to buffer
- frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
- std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin());
+ frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
+ std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin());
std::copy(compressed_header.begin(), compressed_header.end(),
- frame.begin() + uncompressed_header.size());
+ frame_scratch.begin() + uncompressed_header.size());
std::copy(bitstream.begin(), bitstream.end(),
- frame.begin() + uncompressed_header.size() + compressed_header.size());
+ frame_scratch.begin() + uncompressed_header.size() + compressed_header.size());
+
+ vp9_hidden_frame = WasFrameHidden();
+
+ return GetFrameBytes();
}
VpxRangeEncoder::VpxRangeEncoder() {
@@ -944,4 +1050,4 @@ const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
return byte_array;
}
-} // namespace Tegra::Decoder
+} // namespace Tegra::Decoders
diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h
index f1ed19508..9d42033cb 100644
--- a/src/video_core/host1x/codecs/vp9.h
+++ b/src/video_core/host1x/codecs/vp9.h
@@ -10,6 +10,7 @@
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "common/stream.h"
+#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/codecs/vp9_types.h"
#include "video_core/host1x/nvdec_common.h"
@@ -19,7 +20,7 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
-namespace Decoder {
+namespace Decoders {
/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
@@ -110,21 +111,32 @@ private:
std::vector<u8> byte_array;
};
-class VP9 {
+class VP9 final : public Decoder {
public:
- explicit VP9(Host1x::Host1x& host1x);
- ~VP9();
+ explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
+ Host1x::FrameQueue& frame_queue);
+ ~VP9() override;
VP9(const VP9&) = delete;
VP9& operator=(const VP9&) = delete;
- VP9(VP9&&) = default;
+ VP9(VP9&&) = delete;
VP9& operator=(VP9&&) = delete;
- /// Composes the VP9 frame from the GPU state information.
- /// Based on the official VP9 spec documentation
- void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state);
+ [[nodiscard]] std::span<const u8> ComposeFrame() override;
+ std::tuple<u64, u64> GetProgressiveOffsets() override;
+ std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
+
+ bool IsInterlaced() override {
+ return false;
+ }
+
+ std::string_view GetCurrentCodecName() const override {
+ return "VP9";
+ }
+
+private:
/// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const {
return !current_frame_info.show_frame;
@@ -132,10 +144,9 @@ public:
/// Returns a const span to the composed frame data.
[[nodiscard]] std::span<const u8> GetFrameBytes() const {
- return frame;
+ return frame_scratch;
}
-private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
@@ -167,23 +178,22 @@ private:
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+ void WriteSegmentation(VpxBitStreamWriter& writer);
+
/// Returns VP9 information from NVDEC provided offset and size
- [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(
- const Host1x::NvdecCommon::NvdecRegisters& state);
+ [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo();
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
- [[nodiscard]] Vp9FrameContainer GetCurrentFrame(
- const Host1x::NvdecCommon::NvdecRegisters& state);
+ [[nodiscard]] Vp9FrameContainer GetCurrentFrame();
/// Use NVDEC provided information to compose the headers for the current frame
[[nodiscard]] std::vector<u8> ComposeCompressedHeader();
[[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
- Host1x::Host1x& host1x;
- Common::ScratchBuffer<u8> frame;
+ Common::ScratchBuffer<u8> frame_scratch;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
@@ -192,9 +202,11 @@ private:
std::array<Vp9EntropyProbs, 4> frame_ctxs{};
bool swap_ref_indices{};
+ Segmentation last_segmentation{};
+ PictureInfo current_picture_info{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
};
-} // namespace Decoder
+} // namespace Decoders
} // namespace Tegra
diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h
index cc9b25690..77535d5f6 100644
--- a/src/video_core/host1x/codecs/vp9_types.h
+++ b/src/video_core/host1x/codecs/vp9_types.h
@@ -11,7 +11,14 @@
namespace Tegra {
-namespace Decoder {
+namespace Decoders {
/// Slot numbers used to index regs.surface_luma_offsets / regs.surface_chroma_offsets
/// in the VP9 decoder (vp9.cpp reads the Current slot).
+enum class Vp9SurfaceIndex : u32 {
+ Last = 0,
+ Golden = 1,
+ AltRef = 2,
+ Current = 3,
+};
+
struct Vp9FrameDimensions {
s16 width;
s16 height;
@@ -48,11 +55,13 @@ enum class TxMode {
};
/// Segmentation parameters, 0x64 bytes in total (checked by the static_assert below).
/// The defaulted operator== compares every member; VP9::WriteSegmentation uses it to detect
/// whether the data changed since the previous call.
struct Segmentation {
+ constexpr bool operator==(const Segmentation& rhs) const = default;
+
u8 enabled;
u8 update_map;
u8 temporal_update;
u8 abs_delta;
// One flag byte per feature (4) per segment (8); occupies the same 32 bytes as the
// u32-per-segment mask it replaces.
- std::array<u32, 8> feature_mask;
+ std::array<std::array<u8, 4>, 8> feature_enabled;
std::array<std::array<s16, 4>, 8> feature_data;
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
@@ -190,7 +199,17 @@ struct PictureInfo {
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
- INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000
+ std::array<u8, 10 * 10 * 8> kf_bmode_prob; ///< 0x0000
+ std::array<u8, 10 * 10 * 1> kf_bmode_probB; ///< 0x0320
+ std::array<u8, 3> ref_pred_probs; ///< 0x0384
+ std::array<u8, 7> mb_segment_tree_probs; ///< 0x0387
+ std::array<u8, 3> segment_pred_probs; ///< 0x038E
+ std::array<u8, 4> ref_scores; ///< 0x0391
+ std::array<u8, 2> prob_comppred; ///< 0x0395
+ INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397
+ std::array<u8, 10 * 8> kf_uv_mode_prob; ///< 0x03A0
+ std::array<u8, 10 * 1> kf_uv_mode_probB; ///< 0x03F0
+ INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA
std::array<u8, 28> inter_mode_prob; ///< 0x0400
std::array<u8, 4> intra_inter_prob; ///< 0x041C
INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420
@@ -302,5 +321,5 @@ ASSERT_POSITION(class_0_fr, 0x560);
ASSERT_POSITION(coef_probs, 0x5A0);
#undef ASSERT_POSITION
-}; // namespace Decoder
+}; // namespace Decoders
}; // namespace Tegra
diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp
index dceefdb7f..bd0ce9160 100644
--- a/src/video_core/host1x/control.cpp
+++ b/src/video_core/host1x/control.cpp
@@ -27,6 +27,7 @@ void Control::ProcessMethod(Method method, u32 argument) {
}
// Traces the request, then asks the syncpoint manager to wait (WaitHost) on syncpoint `data`
// for the previously stored syncpoint_value.
void Control::Execute(u32 data) {
+ LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value);
host1x.GetSyncpointManager().WaitHost(data, syncpoint_value);
}
diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h
index e117888a3..bd8a2d7ad 100644
--- a/src/video_core/host1x/control.h
+++ b/src/video_core/host1x/control.h
@@ -6,9 +6,7 @@
#include "common/common_types.h"
-namespace Tegra {
-
-namespace Host1x {
+namespace Tegra::Host1x {
class Host1x;
class Nvdec;
@@ -31,10 +29,8 @@ private:
/// For Host1x, execute is waiting on a syncpoint previously written into the state
void Execute(u32 data);
- u32 syncpoint_value{};
Host1x& host1x;
+ u32 syncpoint_value{};
};
-} // namespace Host1x
-
-} // namespace Tegra
+} // namespace Tegra::Host1x
diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp
index 1003cd38d..d603bad8b 100644
--- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp
@@ -5,7 +5,9 @@
#include "common/logging/log.h"
#include "common/scope_exit.h"
#include "common/settings.h"
+#include "core/memory.h"
#include "video_core/host1x/ffmpeg/ffmpeg.h"
+#include "video_core/memory_manager.h"
extern "C" {
#ifdef LIBVA_FOUND
@@ -149,6 +151,7 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context,
}
}
+ LOG_INFO(HW_GPU, "Hardware decoding is disabled due to implementation issues, using CPU.");
return false;
}
@@ -183,8 +186,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) {
return true;
}
-DecoderContext::DecoderContext(const Decoder& decoder) {
- m_codec_context = avcodec_alloc_context3(decoder.GetCodec());
+DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} {
+ m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec());
av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
m_codec_context->thread_count = 0;
m_codec_context->thread_type &= ~FF_THREAD_FRAME;
@@ -216,6 +219,25 @@ bool DecoderContext::OpenContext(const Decoder& decoder) {
}
bool DecoderContext::SendPacket(const Packet& packet) {
+ m_temp_frame = std::make_shared<Frame>();
+ m_got_frame = 0;
+
+// Android can randomly crash when calling decode directly, so skip.
+// TODO update ffmpeg and hope that fixes it.
+#ifndef ANDROID
+ if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
+ m_decode_order = true;
+ auto* codec{ffcodec(m_decoder.GetCodec())};
+ if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(),
+ &m_got_frame, packet.GetPacket());
+ ret < 0) {
+ LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret));
+ return false;
+ }
+ return true;
+ }
+#endif
+
if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) {
LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret));
return false;
@@ -224,139 +246,73 @@ bool DecoderContext::SendPacket(const Packet& packet) {
return true;
}
-std::unique_ptr<Frame> DecoderContext::ReceiveFrame(bool* out_is_interlaced) {
- auto dst_frame = std::make_unique<Frame>();
-
- const auto ReceiveImpl = [&](AVFrame* frame) {
- if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
- LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
- return false;
// Returns the frame produced for the packet last given to SendPacket, or an empty pointer on
// failure. On success m_temp_frame is moved out, leaving the member empty.
// Two paths:
//  - non-Android, software H264: uses the frame SendPacket obtained from the codec's decode
//    callback; if none was produced, calls the callback once more with an empty packet.
//  - everything else: avcodec_receive_frame, with a hardware-to-CPU transfer when a hardware
//    device context is set.
// NOTE(review): m_temp_frame is dereferenced here but only allocated in SendPacket — confirm
// callers never call ReceiveFrame without a preceding SendPacket.
+std::shared_ptr<Frame> DecoderContext::ReceiveFrame() {
+ // Android can randomly crash when calling decode directly, so skip.
+ // TODO update ffmpeg and hope that fixes it.
+#ifndef ANDROID
+ if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
+ m_decode_order = true;
+ auto* codec{ffcodec(m_decoder.GetCodec())};
+ int ret{0};
+
// SendPacket did not yield a frame: retry the decode callback with an empty packet.
+ if (m_got_frame == 0) {
+ Packet packet{{}};
+ auto* pkt = packet.GetPacket();
+ pkt->data = nullptr;
+ pkt->size = 0;
+ ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt);
+ m_codec_context->has_b_frames = 0;
}
- *out_is_interlaced =
-#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
- (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0;
-#else
- frame->interlaced_frame != 0;
-#endif
- return true;
- };
-
- if (m_codec_context->hw_device_ctx) {
- // If we have a hardware context, make a separate frame here to receive the
- // hardware result before sending it to the output.
- Frame intermediate_frame;
-
- if (!ReceiveImpl(intermediate_frame.GetFrame())) {
+ if (m_got_frame == 0 || ret < 0) {
+ LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret);
return {};
}
+ } else
+#endif
+ {
- dst_frame->SetFormat(PreferredGpuFormat);
- if (const int ret =
- av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0);
- ret < 0) {
- LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
- return {};
- }
- } else {
- // Otherwise, decode the frame as normal.
- if (!ReceiveImpl(dst_frame->GetFrame())) {
- return {};
- }
- }
-
- return dst_frame;
-}
-
-DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) {
- const AVFilter* buffer_src = avfilter_get_by_name("buffer");
- const AVFilter* buffer_sink = avfilter_get_by_name("buffersink");
- AVFilterInOut* inputs = avfilter_inout_alloc();
- AVFilterInOut* outputs = avfilter_inout_alloc();
- SCOPE_EXIT {
- avfilter_inout_free(&inputs);
- avfilter_inout_free(&outputs);
- };
-
- // Don't know how to get the accurate time_base but it doesn't matter for yadif filter
- // so just use 1/1 to make buffer filter happy
- std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(),
- frame.GetHeight(), static_cast<int>(frame.GetPixelFormat()));
-
- m_filter_graph = avfilter_graph_alloc();
- int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(),
- nullptr, m_filter_graph);
- if (ret < 0) {
- LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret));
- return;
- }
-
- ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr,
- m_filter_graph);
- if (ret < 0) {
- LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret));
- return;
- }
-
- inputs->name = av_strdup("out");
- inputs->filter_ctx = m_sink_context;
- inputs->pad_idx = 0;
- inputs->next = nullptr;
-
- outputs->name = av_strdup("in");
- outputs->filter_ctx = m_source_context;
- outputs->pad_idx = 0;
- outputs->next = nullptr;
-
- const char* description = "yadif=1:-1:0";
- ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr);
- if (ret < 0) {
- LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret));
- return;
- }
-
- ret = avfilter_graph_config(m_filter_graph, nullptr);
- if (ret < 0) {
- LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret));
- return;
- }
-
- m_initialized = true;
-}
-
-bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) {
- if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(),
- AV_BUFFERSRC_FLAG_KEEP_REF);
- ret < 0) {
- LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret));
- return false;
- }
+ const auto ReceiveImpl = [&](AVFrame* frame) {
+ if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
+ LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
+ return false;
+ }
- return true;
-}
+ return true;
+ };
-std::unique_ptr<Frame> DeinterlaceFilter::DrainSinkFrame() {
- auto dst_frame = std::make_unique<Frame>();
- const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame());
+ if (m_codec_context->hw_device_ctx) {
+ // If we have a hardware context, make a separate frame here to receive the
+ // hardware result before sending it to the output.
+ Frame intermediate_frame;
- if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) {
- return {};
- }
+ if (!ReceiveImpl(intermediate_frame.GetFrame())) {
+ return {};
+ }
- if (ret < 0) {
- LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret));
- return {};
+ m_temp_frame->SetFormat(PreferredGpuFormat);
+ if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(),
+ intermediate_frame.GetFrame(), 0);
+ ret < 0) {
+ LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
+ return {};
+ }
+ } else {
+ // Otherwise, decode the frame as normal.
+ if (!ReceiveImpl(m_temp_frame->GetFrame())) {
+ return {};
+ }
+ }
}
- return dst_frame;
-}
-
-DeinterlaceFilter::~DeinterlaceFilter() {
- avfilter_graph_free(&m_filter_graph);
// Mirror the AV_FRAME_FLAG_INTERLACED flag into the interlaced_frame field, which is what
// Frame::IsInterlaced() reads.
+#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
+ m_temp_frame->GetFrame()->interlaced_frame =
+ (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0;
+#endif
+ return std::move(m_temp_frame);
}
void DecodeApi::Reset() {
- m_deinterlace_filter.reset();
m_hardware_context.reset();
m_decoder_context.reset();
m_decoder.reset();
@@ -382,43 +338,14 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
return true;
}
// Wraps the bytes in an FFmpeg::Packet and forwards it to the decoder context.
// Returns the decoder context's SendPacket result.
-bool DecodeApi::SendPacket(std::span<const u8> packet_data, size_t configuration_size) {
+bool DecodeApi::SendPacket(std::span<const u8> packet_data) {
FFmpeg::Packet packet(packet_data);
return m_decoder_context->SendPacket(packet);
}
// Thin forwarder: returns whatever DecoderContext::ReceiveFrame() yields (an empty pointer
// when no frame could be produced). No deinterlacing step is applied here.
-void DecodeApi::ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue) {
+std::shared_ptr<Frame> DecodeApi::ReceiveFrame() {
// Receive raw frame from decoder.
- bool is_interlaced;
- auto frame = m_decoder_context->ReceiveFrame(&is_interlaced);
- if (!frame) {
- return;
- }
-
- if (!is_interlaced) {
- // If the frame is not interlaced, we can pend it now.
- frame_queue.push(std::move(frame));
- } else {
- // Create the deinterlacer if needed.
- if (!m_deinterlace_filter) {
- m_deinterlace_filter.emplace(*frame);
- }
-
- // Add the frame we just received.
- if (!m_deinterlace_filter->AddSourceFrame(*frame)) {
- return;
- }
-
- // Pend output fields.
- while (true) {
- auto filter_frame = m_deinterlace_filter->DrainSinkFrame();
- if (!filter_frame) {
- break;
- }
-
- frame_queue.push(std::move(filter_frame));
- }
- }
+ return m_decoder_context->ReceiveFrame();
}
} // namespace FFmpeg
diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h
index 1de0bbd83..a74fcba80 100644
--- a/src/video_core/host1x/ffmpeg/ffmpeg.h
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.h
@@ -20,17 +20,20 @@ extern "C" {
#endif
#include <libavcodec/avcodec.h>
-#include <libavfilter/avfilter.h>
-#include <libavfilter/buffersink.h>
-#include <libavfilter/buffersrc.h>
-#include <libavutil/avutil.h>
#include <libavutil/opt.h>
+#ifndef ANDROID
+#include <libavcodec/codec_internal.h>
+#endif
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
+namespace Tegra {
+class MemoryManager;
+}
+
namespace FFmpeg {
class Packet;
@@ -90,6 +93,10 @@ public:
return m_frame->data[plane];
}
+ // Read-only overload: returns the wrapped AVFrame's data pointer for the given plane index.
+ const u8* GetPlane(int plane) const {
+ return m_frame->data[plane];
+ }
+
u8** GetPlanes() const {
return m_frame->data;
}
@@ -98,6 +105,14 @@ public:
m_frame->format = format;
}
+ // True when the wrapped AVFrame's interlaced_frame field is non-zero.
+ // NOTE(review): this reads the legacy interlaced_frame field; the decoder's receive path
+ // copies AV_FRAME_FLAG_INTERLACED into it under a version guard — confirm the field is
+ // still available on every FFmpeg version this builds against.
+ bool IsInterlaced() const {
+ return m_frame->interlaced_frame != 0;
+ }
+
+ // True when the wrapped AVFrame carries a hardware frames context (hw_frames_ctx is set).
+ bool IsHardwareDecoded() const {
+ return m_frame->hw_frames_ctx != nullptr;
+ }
+
AVFrame* GetFrame() const {
return m_frame;
}
@@ -160,33 +175,22 @@ public:
void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt);
bool OpenContext(const Decoder& decoder);
bool SendPacket(const Packet& packet);
- std::unique_ptr<Frame> ReceiveFrame(bool* out_is_interlaced);
+ std::shared_ptr<Frame> ReceiveFrame();
AVCodecContext* GetCodecContext() const {
return m_codec_context;
}
-private:
- AVCodecContext* m_codec_context{};
-};
-
-// Wraps an AVFilterGraph.
-class DeinterlaceFilter {
-public:
- YUZU_NON_COPYABLE(DeinterlaceFilter);
- YUZU_NON_MOVEABLE(DeinterlaceFilter);
-
- explicit DeinterlaceFilter(const Frame& frame);
- ~DeinterlaceFilter();
-
- bool AddSourceFrame(const Frame& frame);
- std::unique_ptr<Frame> DrainSinkFrame();
+ // Returns the m_decode_order flag.
+ // presumably true means frames are handed out in decode order rather than presentation
+ // order — TODO confirm against the code that sets m_decode_order.
+ bool UsingDecodeOrder() const {
+ return m_decode_order;
+ }
private:
- AVFilterGraph* m_filter_graph{};
- AVFilterContext* m_source_context{};
- AVFilterContext* m_sink_context{};
- bool m_initialized{};
+ const Decoder& m_decoder;
+ AVCodecContext* m_codec_context{};
+ s32 m_got_frame{};
+ std::shared_ptr<Frame> m_temp_frame{};
+ bool m_decode_order{};
};
class DecodeApi {
@@ -200,14 +204,17 @@ public:
bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec);
void Reset();
- bool SendPacket(std::span<const u8> packet_data, size_t configuration_size);
- void ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue);
+ // Forwards to DecoderContext::UsingDecodeOrder().
+ // m_decoder_context is a std::optional that Reset() empties and this accessor dereferences
+ // it unchecked, so it must only be called while a decoder context exists.
+ bool UsingDecodeOrder() const {
+ return m_decoder_context->UsingDecodeOrder();
+ }
+
+ bool SendPacket(std::span<const u8> packet_data);
+ std::shared_ptr<Frame> ReceiveFrame();
private:
std::optional<FFmpeg::Decoder> m_decoder;
std::optional<FFmpeg::DecoderContext> m_decoder_context;
std::optional<FFmpeg::HardwareContext> m_hardware_context;
- std::optional<FFmpeg::DeinterlaceFilter> m_deinterlace_filter;
};
} // namespace FFmpeg
diff --git a/src/video_core/host1x/host1x.cpp b/src/video_core/host1x/host1x.cpp
index e923bfa22..293bca6d7 100644
--- a/src/video_core/host1x/host1x.cpp
+++ b/src/video_core/host1x/host1x.cpp
@@ -3,10 +3,10 @@
#include "core/core.h"
#include "video_core/host1x/host1x.h"
+#include "video_core/host1x/nvdec.h"
+#include "video_core/host1x/vic.h"
-namespace Tegra {
-
-namespace Host1x {
+namespace Tegra::Host1x {
Host1x::Host1x(Core::System& system_)
: system{system_}, syncpoint_manager{},
@@ -15,6 +15,22 @@ Host1x::Host1x(Core::System& system_)
Host1x::~Host1x() = default;
-} // namespace Host1x
+/// Creates the command-processing device for channel `fd` and stores it in `devices`,
+/// replacing any device already registered under that fd.
+/// Only NvDec and VIC channels are backed by a device; any other type is logged and ignored.
+void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) {
+    if (type == ChannelType::NvDec) {
+        devices[fd] = std::make_unique<Tegra::Host1x::Nvdec>(*this, fd, syncpt, frame_queue);
+        return;
+    }
+    if (type == ChannelType::VIC) {
+        devices[fd] = std::make_unique<Tegra::Host1x::Vic>(*this, fd, syncpt, frame_queue);
+        return;
+    }
+    LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast<u32>(type));
+}
+
+/// Destroys the device registered for `fd`, if any.
+/// `type` is not needed to locate the device (devices are keyed by fd only), so it is marked
+/// unused to keep unused-parameter diagnostics quiet.
+void Host1x::StopDevice(s32 fd, [[maybe_unused]] ChannelType type) {
+    devices.erase(fd);
+}
-} // namespace Tegra
+} // namespace Tegra::Host1x
diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h
index d72d97b7b..8debac93d 100644
--- a/src/video_core/host1x/host1x.h
+++ b/src/video_core/host1x/host1x.h
@@ -3,9 +3,14 @@
#pragma once
+#include <unordered_map>
+#include <unordered_set>
+#include <queue>
+
#include "common/common_types.h"
#include "common/address_space.h"
+#include "video_core/cdma_pusher.h"
#include "video_core/host1x/gpu_device_memory_manager.h"
#include "video_core/host1x/syncpoint_manager.h"
#include "video_core/memory_manager.h"
@@ -14,15 +19,137 @@ namespace Core {
class System;
} // namespace Core
-namespace Tegra {
+namespace FFmpeg {
+class Frame;
+} // namespace FFmpeg
+
+namespace Tegra::Host1x {
+class Nvdec;
+
+class FrameQueue {
+public:
+    /// Registers empty presentation-order and decode-order containers for `fd`.
+    /// Containers that already exist for `fd` are left untouched.
+    void Open(s32 fd) {
+        std::scoped_lock lock{m_mutex};
+        m_presentation_order.try_emplace(fd);
+        m_decode_order.try_emplace(fd);
+    }
+
+ // Removes both frame containers registered for `fd`, releasing any frames still queued in
+ // them. Erasing an fd that was never opened is a no-op.
+ void Close(s32 fd) {
+ std::scoped_lock l{m_mutex};
+ m_presentation_order.erase(fd);
+ m_decode_order.erase(fd);
+ }
+
+    /// Finds the nvdec fd that currently holds a frame registered under `search_offset`.
+    ///
+    /// Vic does not know which nvdec is producing frames for it, so every open fd is searched:
+    /// the presentation-order queues first, then the decode-order maps.
+    /// Returns -1 when no fd holds a frame for that offset.
+    s32 VicFindNvdecFdFromOffset(u64 search_offset) {
+        std::scoped_lock l{m_mutex};
+        for (const auto& [fd, frames] : m_presentation_order) {
+            for (const auto& [offset, frame] : frames) {
+                if (offset == search_offset) {
+                    return fd;
+                }
+            }
+        }
+
+        // The decode-order containers are hash maps keyed by offset, so use their lookup
+        // instead of walking every entry.
+        for (const auto& [fd, frames] : m_decode_order) {
+            if (frames.find(search_offset) != frames.end()) {
+                return fd;
+            }
+        }
+
+        return -1;
+    }
-namespace Host1x {
+    /// Appends `frame`, tagged with `offset`, to the back of the presentation-order queue of
+    /// `fd`. Does nothing (and does not take the frame) when `fd` has no queue.
+    void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr<FFmpeg::Frame>&& frame) {
+        std::scoped_lock lock{m_mutex};
+        if (const auto it = m_presentation_order.find(fd); it != m_presentation_order.end()) {
+            it->second.emplace_back(offset, std::move(frame));
+        }
+    }
+
+    /// Stores `frame` under `offset` in the decode-order map of `fd`, replacing any frame
+    /// already stored for that offset. Does nothing when `fd` has no map.
+    void PushDecodeOrder(s32 fd, u64 offset, std::shared_ptr<FFmpeg::Frame>&& frame) {
+        std::scoped_lock lock{m_mutex};
+        if (const auto it = m_decode_order.find(fd); it != m_decode_order.end()) {
+            it->second.insert_or_assign(offset, std::move(frame));
+        }
+    }
+
+    /// Takes one frame out of the queue for `fd`.
+    ///
+    /// When `fd` has a non-empty presentation-order queue, its oldest frame is returned and
+    /// `offset` is not consulted. Otherwise the decode-order map of `fd` is searched for
+    /// `offset`. Returns an empty pointer for fd == -1 or when nothing matches.
+    std::shared_ptr<FFmpeg::Frame> GetFrame(s32 fd, u64 offset) {
+        if (fd == -1) {
+            return {};
+        }
+
+        std::scoped_lock lock{m_mutex};
+
+        const auto present_it = m_presentation_order.find(fd);
+        const bool has_present_frames =
+            present_it != m_presentation_order.end() && !present_it->second.empty();
+        if (has_present_frames) {
+            return GetPresentOrderLocked(fd);
+        }
+
+        const auto decode_it = m_decode_order.find(fd);
+        const bool has_decode_frames =
+            decode_it != m_decode_order.end() && !decode_it->second.empty();
+        if (has_decode_frames) {
+            return GetDecodeOrderLocked(fd, offset);
+        }
+
+        return {};
+    }
+
+private:
+    /// Pops and returns the oldest frame in the presentation-order queue of `fd`, or an empty
+    /// pointer when the queue is missing or empty. Takes no lock itself; GetFrame() calls it
+    /// while holding m_mutex.
+    std::shared_ptr<FFmpeg::Frame> GetPresentOrderLocked(s32 fd) {
+        const auto it = m_presentation_order.find(fd);
+        if (it != m_presentation_order.end() && !it->second.empty()) {
+            auto& queue = it->second;
+            auto frame = std::move(queue.front().second);
+            queue.pop_front();
+            return frame;
+        }
+        return {};
+    }
+
+    /// Removes and returns the frame stored under `offset` in the decode-order map of `fd`, or
+    /// an empty pointer when there is no such entry. Takes no lock itself; GetFrame() calls it
+    /// while holding m_mutex.
+    std::shared_ptr<FFmpeg::Frame> GetDecodeOrderLocked(s32 fd, u64 offset) {
+        const auto fd_it = m_decode_order.find(fd);
+        if (fd_it == m_decode_order.end()) {
+            return {};
+        }
+
+        auto& frames = fd_it->second;
+        const auto frame_it = frames.find(offset);
+        if (frame_it == frames.end()) {
+            return {};
+        }
+
+        auto node = frames.extract(frame_it);
+        return std::move(node.mapped());
+    }
+
+ using FramePtr = std::shared_ptr<FFmpeg::Frame>;
+
+ std::mutex m_mutex{};
+ std::unordered_map<s32, std::deque<std::pair<u64, FramePtr>>> m_presentation_order;
+ std::unordered_map<s32, std::unordered_map<u64, FramePtr>> m_decode_order;
+};
+
+// Identifies the kind of host1x channel a device fd belongs to.
+// Host1x::StartDevice only creates a device for NvDec and VIC; every other value is logged as
+// unimplemented.
+enum class ChannelType : u32 {
+ MsEnc = 0,
+ VIC = 1,
+ GPU = 2,
+ NvDec = 3,
+ Display = 4,
+ NvJpg = 5,
+ TSec = 6,
+ Max = 7,
+};
class Host1x {
public:
explicit Host1x(Core::System& system);
~Host1x();
+ // Returns the Core::System reference this Host1x was constructed with.
+ Core::System& System() {
+ return system;
+ }
+
SyncpointManager& GetSyncpointManager() {
return syncpoint_manager;
}
@@ -55,14 +182,25 @@ public:
return *allocator;
}
+ void StartDevice(s32 fd, ChannelType type, u32 syncpt);
+ void StopDevice(s32 fd, ChannelType type);
+
+    /// Forwards a command list to the device registered for `fd`.
+    /// The call is ignored (and `entries` left untouched) when no device exists for `fd`.
+    void PushEntries(s32 fd, ChCommandHeaderList&& entries) {
+        if (const auto device = devices.find(fd); device != devices.end()) {
+            device->second->PushEntries(std::move(entries));
+        }
+    }
+
private:
Core::System& system;
SyncpointManager syncpoint_manager;
Tegra::MaxwellDeviceMemoryManager memory_manager;
Tegra::MemoryManager gmmu_manager;
std::unique_ptr<Common::FlatAllocator<u32, 0, 32>> allocator;
+ FrameQueue frame_queue;
+ std::unordered_map<s32, std::unique_ptr<CDmaPusher>> devices;
};
-} // namespace Host1x
-
-} // namespace Tegra
+} // namespace Tegra::Host1x
diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp
index b8f5866d3..741a7d5c1 100644
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@@ -2,6 +2,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
+
+#include "common/polyfill_thread.h"
+#include "common/settings.h"
+#include "video_core/host1x/codecs/h264.h"
+#include "video_core/host1x/codecs/vp8.h"
+#include "video_core/host1x/codecs/vp9.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
@@ -10,37 +16,69 @@ namespace Tegra::Host1x {
#define NVDEC_REG_INDEX(field_name) \
(offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64))
-Nvdec::Nvdec(Host1x& host1x_)
- : host1x(host1x_), state{}, codec(std::make_unique<Codec>(host1x, state)) {}
+// Stores the channel id, syncpoint and frame queue reference, then registers this nvdec's id
+// with the frame queue so frames can be pushed under it.
+Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_)
+ : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, frame_queue{frame_queue_} {
+ LOG_INFO(HW_GPU, "Created nvdec {}", id);
+ frame_queue.Open(id);
+}
-Nvdec::~Nvdec() = default;
+// Only logs the teardown.
+// NOTE(review): the constructor calls frame_queue.Open(id) but nothing here calls
+// frame_queue.Close(id) — confirm the queue entry for this id is released elsewhere.
+Nvdec::~Nvdec() {
+ LOG_INFO(HW_GPU, "Destroying nvdec {}", id);
+}
void Nvdec::ProcessMethod(u32 method, u32 argument) {
- state.reg_array[method] = static_cast<u64>(argument) << 8;
+ // The raw argument is stored as-is; the << 8 scaling is now applied on read by
+ // NvdecCommon::Offset::Address().
+ regs.reg_array[method] = argument;
switch (method) {
case NVDEC_REG_INDEX(set_codec_id):
- codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument));
+ CreateDecoder(static_cast<NvdecCommon::VideoCodec>(argument));
break;
- case NVDEC_REG_INDEX(execute):
+ case NVDEC_REG_INDEX(execute): {
+ // A one-shot delay requested through SetWait() is served before executing.
+ if (wait_needed) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(32));
+ wait_needed = false;
+ }
Execute();
- break;
+ } break;
}
}
-std::unique_ptr<FFmpeg::Frame> Nvdec::GetFrame() {
- return codec->GetCurrentFrame();
+/// Creates the decoder matching `codec` the first time a codec id is set.
+///
+/// Once a decoder exists, later calls are ignored. For an unsupported codec nothing is
+/// created and `decoder` stays null.
+void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
+    if (decoder) {
+        return;
+    }
+    switch (codec) {
+    case NvdecCommon::VideoCodec::H264:
+        decoder = std::make_unique<Decoders::H264>(host1x, regs, id, frame_queue);
+        break;
+    case NvdecCommon::VideoCodec::VP8:
+        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, frame_queue);
+        break;
+    case NvdecCommon::VideoCodec::VP9:
+        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, frame_queue);
+        break;
+    default:
+        // `decoder` is still null on this path, so report the raw codec value instead of
+        // asking the decoder for its name, and skip the log below, which dereferences it too.
+        UNIMPLEMENTED_MSG("Codec {}", static_cast<u64>(codec));
+        return;
+    }
+    LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id);
}
void Nvdec::Execute() {
- switch (codec->GetCurrentCodec()) {
+ if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
+ // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms
+ // execution time. Sleep for half of a 60 fps frame just in case.
+ std::this_thread::sleep_for(std::chrono::milliseconds(8));
+ return;
+ }
+ // No decoder exists if execute arrives before set_codec_id, or if the requested codec was
+ // not one CreateDecoder supports; bail out instead of dereferencing a null pointer.
+ if (!decoder) [[unlikely]] {
+ LOG_ERROR(HW_GPU, "Nvdec {} received execute without a decoder", id);
+ return;
+ }
+ switch (decoder->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::VP8:
case NvdecCommon::VideoCodec::VP9:
- codec->Decode();
+ decoder->Decode();
break;
default:
- UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName());
+ UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());
break;
}
}
diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h
index ddddb8d28..565c65f66 100644
--- a/src/video_core/host1x/nvdec.h
+++ b/src/video_core/host1x/nvdec.h
@@ -5,33 +5,47 @@
#include <memory>
#include <vector>
+
#include "common/common_types.h"
-#include "video_core/host1x/codecs/codec.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/host1x/codecs/decoder.h"
namespace Tegra {
namespace Host1x {
-
class Host1x;
+class FrameQueue;
-class Nvdec {
+// Nvdec device: a CDmaPusher that records register writes and creates/drives a video decoder.
+class Nvdec final : public CDmaPusher {
public:
- explicit Nvdec(Host1x& host1x);
+ explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue_);
~Nvdec();
/// Writes the method into the state, Invoke Execute() if encountered
- void ProcessMethod(u32 method, u32 argument);
+ void ProcessMethod(u32 method, u32 arg) override;
+
+ /// Returns the syncpoint id passed to the constructor
+ u32 GetSyncpoint() const {
+ return syncpoint;
+ }
- /// Return most recently decoded frame
- [[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetFrame();
+ /// Requests a one-shot delay that ProcessMethod serves before the next execute
+ void SetWait() {
+ wait_needed = true;
+ }
private:
+ /// Create the decoder when the codec id is set
+ void CreateDecoder(NvdecCommon::VideoCodec codec);
+
/// Invoke codec to decode a frame
void Execute();
- Host1x& host1x;
- NvdecCommon::NvdecRegisters state;
- std::unique_ptr<Codec> codec;
+ s32 id;
+ u32 syncpoint;
+ FrameQueue& frame_queue;
+
+ NvdecCommon::NvdecRegisters regs{};
+ // Null until CreateDecoder() has run for a supported codec.
+ std::unique_ptr<Decoder> decoder;
+ bool wait_needed{false};
};
} // namespace Host1x
diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h
index 49d67ebbe..dfd8bb377 100644
--- a/src/video_core/host1x/nvdec_common.h
+++ b/src/video_core/host1x/nvdec_common.h
@@ -17,6 +17,17 @@ enum class VideoCodec : u64 {
VP9 = 0x9,
};
+// Wrapper for a 64-bit register slot that holds an address in pre-shifted form.
+// Address() returns the stored value shifted left by 8 bits; the raw value is private so
+// callers cannot use it unshifted by accident.
+struct Offset {
+ constexpr u64 Address() const noexcept {
+ return offset << 8;
+ }
+
+private:
+ // Deliberately has no default initializer: the static_assert below requires the type to
+ // stay trivial.
+ u64 offset;
+};
+static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
+static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!");
+
// NVDEC should use a 32-bit address space, but is mapped to 64-bit,
// doubling the sizes here is compensating for that.
struct NvdecRegisters {
@@ -38,29 +49,40 @@ struct NvdecRegisters {
BitField<17, 1, u64> all_intra_frame;
};
} control_params;
- u64 picture_info_offset; ///< 0x0808
- u64 frame_bitstream_offset; ///< 0x0810
- u64 frame_number; ///< 0x0818
- u64 h264_slice_data_offsets; ///< 0x0820
- u64 h264_mv_dump_offset; ///< 0x0828
- INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830
- u64 frame_stats_offset; ///< 0x0848
- u64 h264_last_surface_luma_offset; ///< 0x0850
- u64 h264_last_surface_chroma_offset; ///< 0x0858
- std::array<u64, 17> surface_luma_offset; ///< 0x0860
- std::array<u64, 17> surface_chroma_offset; ///< 0x08E8
- INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970
- u64 vp8_prob_data_offset; ///< 0x0A80
- u64 vp8_header_partition_buf_offset; ///< 0x0A88
- INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90
- u64 vp9_entropy_probs_offset; ///< 0x0B80
- u64 vp9_backward_updates_offset; ///< 0x0B88
- u64 vp9_last_frame_segmap_offset; ///< 0x0B90
- u64 vp9_curr_frame_segmap_offset; ///< 0x0B98
- INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0
- u64 vp9_last_frame_mvs_offset; ///< 0x0BA8
- u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0
- INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8
+ Offset picture_info_offset; ///< 0x0808
+ Offset frame_bitstream_offset; ///< 0x0810
+ u64 frame_number; ///< 0x0818
+ Offset h264_slice_data_offsets; ///< 0x0820
+ Offset h264_mv_dump_offset; ///< 0x0828
+ INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830
+ Offset frame_stats_offset; ///< 0x0848
+ Offset h264_last_surface_luma_offset; ///< 0x0850
+ Offset h264_last_surface_chroma_offset; ///< 0x0858
+ std::array<Offset, 17> surface_luma_offsets; ///< 0x0860
+ std::array<Offset, 17> surface_chroma_offsets; ///< 0x08E8
+ Offset pic_scratch_buf_offset; ///< 0x0970
+ Offset external_mvbuffer_offset; ///< 0x0978
+ INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980
+ Offset h264_mbhist_buffer_offset; ///< 0x0A00
+ INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08
+ Offset vp8_prob_data_offset; ///< 0x0A80
+ Offset vp8_header_partition_buf_offset; ///< 0x0A88
+ INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90
+ Offset hvec_scalist_list_offset; ///< 0x0B00
+ Offset hvec_tile_sizes_offset; ///< 0x0B08
+ Offset hvec_filter_buffer_offset; ///< 0x0B10
+ Offset hvec_sao_buffer_offset; ///< 0x0B18
+ Offset hvec_slice_info_buffer_offset; ///< 0x0B20
+ Offset hvec_slice_group_index_offset; ///< 0x0B28
+ INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30
+ Offset vp9_prob_tab_buffer_offset; ///< 0x0B80
+ Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88
+ Offset vp9_segment_read_buffer_offset; ///< 0x0B90
+ Offset vp9_segment_write_buffer_offset; ///< 0x0B98
+ Offset vp9_tile_size_buffer_offset; ///< 0x0BA0
+ Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8
+ Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0
+ Offset vp9_filter_buffer_offset; ///< 0x0BB8
};
std::array<u64, NUM_REGS> reg_array;
};
@@ -81,16 +103,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104);
ASSERT_REG_POSITION(frame_stats_offset, 0x109);
ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A);
ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B);
-ASSERT_REG_POSITION(surface_luma_offset, 0x10C);
-ASSERT_REG_POSITION(surface_chroma_offset, 0x11D);
+ASSERT_REG_POSITION(surface_luma_offsets, 0x10C);
+ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D);
ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150);
ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151);
-ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170);
-ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171);
-ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172);
-ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173);
-ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175);
-ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176);
+ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170);
+ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171);
+ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172);
+ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173);
+ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175);
+ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176);
#undef ASSERT_REG_POSITION
diff --git a/src/video_core/host1x/sync_manager.cpp b/src/video_core/host1x/sync_manager.cpp
deleted file mode 100644
index 5ef9ea217..000000000
--- a/src/video_core/host1x/sync_manager.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-FileCopyrightText: Ryujinx Team and Contributors
-// SPDX-License-Identifier: MIT
-
-#include <algorithm>
-#include "sync_manager.h"
-#include "video_core/host1x/host1x.h"
-#include "video_core/host1x/syncpoint_manager.h"
-
-namespace Tegra {
-namespace Host1x {
-
-SyncptIncrManager::SyncptIncrManager(Host1x& host1x_) : host1x(host1x_) {}
-SyncptIncrManager::~SyncptIncrManager() = default;
-
-void SyncptIncrManager::Increment(u32 id) {
- increments.emplace_back(0, 0, id, true);
- IncrementAllDone();
-}
-
-u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
- const u32 handle = current_id++;
- increments.emplace_back(handle, class_id, id);
- return handle;
-}
-
-void SyncptIncrManager::SignalDone(u32 handle) {
- const auto done_incr =
- std::find_if(increments.begin(), increments.end(),
- [handle](const SyncptIncr& incr) { return incr.id == handle; });
- if (done_incr != increments.cend()) {
- done_incr->complete = true;
- }
- IncrementAllDone();
-}
-
-void SyncptIncrManager::IncrementAllDone() {
- std::size_t done_count = 0;
- for (; done_count < increments.size(); ++done_count) {
- if (!increments[done_count].complete) {
- break;
- }
- auto& syncpoint_manager = host1x.GetSyncpointManager();
- syncpoint_manager.IncrementGuest(increments[done_count].syncpt_id);
- syncpoint_manager.IncrementHost(increments[done_count].syncpt_id);
- }
- increments.erase(increments.begin(), increments.begin() + done_count);
-}
-
-} // namespace Host1x
-} // namespace Tegra
diff --git a/src/video_core/host1x/sync_manager.h b/src/video_core/host1x/sync_manager.h
deleted file mode 100644
index 7bb77fa27..000000000
--- a/src/video_core/host1x/sync_manager.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-FileCopyrightText: Ryujinx Team and Contributors
-// SPDX-License-Identifier: MIT
-
-#pragma once
-
-#include <mutex>
-#include <vector>
-#include "common/common_types.h"
-
-namespace Tegra {
-
-namespace Host1x {
-
-class Host1x;
-
-struct SyncptIncr {
- u32 id;
- u32 class_id;
- u32 syncpt_id;
- bool complete;
-
- SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
- : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
-};
-
-class SyncptIncrManager {
-public:
- explicit SyncptIncrManager(Host1x& host1x);
- ~SyncptIncrManager();
-
- /// Add syncpoint id and increment all
- void Increment(u32 id);
-
- /// Returns a handle to increment later
- u32 IncrementWhenDone(u32 class_id, u32 id);
-
- /// IncrememntAllDone, including handle
- void SignalDone(u32 handle);
-
- /// Increment all sequential pending increments that are already done.
- void IncrementAllDone();
-
-private:
- std::vector<SyncptIncr> increments;
- std::mutex increment_lock;
- u32 current_id{};
-
- Host1x& host1x;
-};
-
-} // namespace Host1x
-
-} // namespace Tegra
diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp
index 8f23ce527..8f51c92af 100644
--- a/src/video_core/host1x/syncpoint_manager.cpp
+++ b/src/video_core/host1x/syncpoint_manager.cpp
@@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
return {};
}
- std::unique_lock lk(guard);
+ std::scoped_lock lk(guard);
if (syncpoint.load(std::memory_order_relaxed) >= expected_value) {
action();
return {};
@@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_storage,
const ActionHandle& handle) {
- std::unique_lock lk(guard);
+ std::scoped_lock lk(guard);
// We want to ensure the iterator still exists prior to erasing it
// Otherwise, if an invalid iterator was passed in then it could lead to UB
@@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic<u32>& syncpoint, std::condition_var
std::list<RegisteredAction>& action_storage) {
auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1};
- std::unique_lock lk(guard);
+ std::scoped_lock lk(guard);
auto it = action_storage.begin();
while (it != action_storage.end()) {
if (it->expected_value > new_value) {
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index d154746af..3ad56bb80 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -2,6 +2,21 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <array>
+#include <tuple>
+#include <stdint.h>
+
+#if defined(ARCHITECTURE_x86_64)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <immintrin.h>
+#endif
+#elif defined(ARCHITECTURE_arm64)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
+#include <sse2neon.h>
+#pragma GCC diagnostic pop
+#endif
extern "C" {
#if defined(__GNUC__) || defined(__clang__)
@@ -14,228 +29,1231 @@ extern "C" {
#endif
}
+#include "common/alignment.h"
#include "common/assert.h"
#include "common/bit_field.h"
#include "common/logging/log.h"
+#include "common/polyfill_thread.h"
+#include "common/settings.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/guest_memory.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
#include "video_core/host1x/vic.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"
-namespace Tegra {
-
-namespace Host1x {
+#if defined(ARCHITECTURE_x86_64)
+#include "common/x64/cpu_detect.h"
+#endif
+namespace Tegra::Host1x {
namespace {
-enum class VideoPixelFormat : u64_le {
- RGBA8 = 0x1f,
- BGRA8 = 0x20,
- RGBX8 = 0x23,
- YUV420 = 0x44,
-};
-} // Anonymous namespace
-
-union VicConfig {
- u64_le raw{};
- BitField<0, 7, VideoPixelFormat> pixel_format;
- BitField<7, 2, u64_le> chroma_loc_horiz;
- BitField<9, 2, u64_le> chroma_loc_vert;
- BitField<11, 4, u64_le> block_linear_kind;
- BitField<15, 4, u64_le> block_linear_height_log2;
- BitField<32, 14, u64_le> surface_width_minus1;
- BitField<46, 14, u64_le> surface_height_minus1;
-};
-
-Vic::Vic(Host1x& host1x_, std::shared_ptr<Nvdec> nvdec_processor_)
- : host1x(host1x_),
- nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {}
-
-Vic::~Vic() = default;
-
-void Vic::ProcessMethod(Method method, u32 argument) {
- LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
- const u64 arg = static_cast<u64>(argument) << 8;
- switch (method) {
- case Method::Execute:
+// Reports whether the host CPU advertises SSE4.1; always false on non-x86_64 builds.
+static bool HasSSE41() {
+#if defined(ARCHITECTURE_x86_64)
+    return Common::GetCPUCaps().sse4_1;
+#else
+    return false;
+#endif
+}
+
+// Copies a linear surface (`input`, `in_stride` bytes per row) into `output` using the swizzled
+// addressing from the reference linked below. Rows are consumed two at a time and each pair is
+// moved in 16-byte chunks.
+// NOTE(review): the second memcpy reads row y + 1, so an odd `height` would read one row past
+// the last — presumably callers always pass an even height; confirm.
+void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride,
+ u32 height) {
+ /*
+ * Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
+ * Can only handle block height == 1.
+ */
+ const uint32_t x_mask = 0xFFFFFFD2u;
+ const uint32_t y_mask = 0x2Cu;
+ uint32_t offs_x{};
+ uint32_t offs_y{};
+ uint32_t offs_line{};
+
+ for (u32 y = 0; y < height; y += 2) {
+ auto dst_line = output.data() + offs_y * 16;
+ const auto src_line = input.data() + y * (in_stride / 16) * 16;
+
+ offs_line = offs_x;
+ for (u32 x = 0; x < in_stride; x += 16) {
+ // Chunk from row y, immediately followed by the matching chunk from row y + 1.
+ std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16);
+ std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16);
+ // Subtract-then-mask steps the offset through the bit positions selected by x_mask.
+ offs_line = (offs_line - x_mask) & x_mask;
+ }
+
+ // Same stepping for the row offset, restricted to the bits in y_mask.
+ offs_y = (offs_y - y_mask) & y_mask;
+
+ /* Wrap into next tile row */
+ if (!offs_y) {
+ offs_x += out_stride;
+ }
+ }
+}
+
+} // namespace
+
+// Stores the channel id, syncpoint and frame queue reference, and caches the result of the
+// SSE4.1 capability check once at construction.
+Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_)
+ : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt},
+ frame_queue{frame_queue_}, has_sse41{HasSSE41()} {
+ LOG_INFO(HW_GPU, "Created vic {}", id);
+}
+
+// Logs the teardown and closes the frame queue entry for this Vic's id.
+// NOTE(review): FrameQueue::Open() is called from the Nvdec constructor with the nvdec id,
+// while Close() here receives the Vic id — confirm these ids are expected to be the same.
+Vic::~Vic() {
+ LOG_INFO(HW_GPU, "Destroying vic {}", id);
+ frame_queue.Close(id);
+}
+
+// Records the register write and runs Execute() when the execute method is written.
+// `method` indexes regs.reg_array directly; it is multiplied by sizeof(u32) before being
+// compared against the Method enumerators.
+void Vic::ProcessMethod(u32 method, u32 arg) {
+ LOG_TRACE(HW_GPU, "Vic {} method 0x{:X}", id, static_cast<u32>(method));
+ regs.reg_array[method] = arg;
+
+ switch (static_cast<Method>(method * sizeof(u32))) {
+ case Method::Execute: {
Execute();
+ } break;
+ default:
break;
- case Method::SetConfigStructOffset:
- config_struct_address = arg;
+ }
+}
+
+// Reads the config struct from guest memory, composes every enabled slot into output_surface,
+// then writes the result out in the configured output pixel format.
+void Vic::Execute() {
+ ConfigStruct config{};
+ memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct));
+
+ // The config stores the output dimensions minus one.
+ auto output_width{config.output_surface_config.out_surface_width + 1};
+ auto output_height{config.output_surface_config.out_surface_height + 1};
+ output_surface.resize_destructive(output_width * output_height);
+
+ if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
+ // Fill the frame with black, as otherwise they can have random data and be very glitchy.
+ std::fill(output_surface.begin(), output_surface.end(), Pixel{});
+ } else {
+ for (size_t i = 0; i < config.slot_structs.size(); i++) {
+ auto& slot_config{config.slot_structs[i]};
+ if (!slot_config.config.slot_enable) {
+ continue;
+ }
+
+ auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()};
+ // Resolve the producing nvdec once; the lookup is retried on later slots and calls
+ // for as long as it keeps returning -1.
+ if (nvdec_id == -1) {
+ nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset);
+ }
+
+ auto frame = frame_queue.GetFrame(nvdec_id, luma_offset);
+ if (!frame.get()) {
+ LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset 0x{:X}", id, luma_offset);
+ continue;
+ }
+
+ // The template argument selects planar (YUV420P) versus semiplanar (NV12) input.
+ switch (frame->GetPixelFormat()) {
+ case AV_PIX_FMT_YUV420P:
+ ReadY8__V8U8_N420<true>(slot_config, regs.surfaces[i], std::move(frame));
+ break;
+ case AV_PIX_FMT_NV12:
+ ReadY8__V8U8_N420<false>(slot_config, regs.surfaces[i], std::move(frame));
+ break;
+ default:
+ // NOTE(review): the switch is on the decoded frame's format but the message
+ // prints the slot's configured format, and Blend() below still runs — confirm
+ // both are intended.
+ UNIMPLEMENTED_MSG(
+ "Unimplemented slot pixel format {}",
+ static_cast<u32>(slot_config.surface_config.slot_pixel_format.Value()));
+ break;
+ }
+
+ Blend(config, slot_config);
+ }
+ }
+
+ // X8B8G8R8 output shares the A8B8G8R8 writer.
+ switch (config.output_surface_config.out_pixel_format) {
+ case VideoPixelFormat::A8B8G8R8:
+ case VideoPixelFormat::X8B8G8R8:
+ WriteABGR<VideoPixelFormat::A8B8G8R8>(config.output_surface_config);
break;
- case Method::SetOutputSurfaceLumaOffset:
- output_surface_luma_address = arg;
+ case VideoPixelFormat::A8R8G8B8:
+ WriteABGR<VideoPixelFormat::A8R8G8B8>(config.output_surface_config);
break;
- case Method::SetOutputSurfaceChromaOffset:
- output_surface_chroma_address = arg;
+ case VideoPixelFormat::Y8__V8U8_N420:
+ WriteY8__V8U8_N420(config.output_surface_config);
break;
default:
+ UNIMPLEMENTED_MSG("Unknown video pixel format {}",
+ config.output_surface_config.out_pixel_format.Value());
break;
}
}
-void Vic::Execute() {
- if (output_surface_luma_address == 0) {
- LOG_ERROR(Service_NVDRV, "VIC Luma address not set.");
- return;
+template <bool Planar, bool Interlaced>
+void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot,
+ std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame) {
+ const auto out_luma_width{slot.surface_config.slot_surface_width + 1};
+ auto out_luma_height{slot.surface_config.slot_surface_height + 1};
+ const auto out_luma_stride{out_luma_width};
+
+ if constexpr (Interlaced) {
+ out_luma_height *= 2;
}
- const VicConfig config{host1x.GMMU().Read<u64>(config_struct_address + 0x20)};
- auto frame = nvdec_processor->GetFrame();
- if (!frame) {
+
+ slot_surface.resize_destructive(out_luma_width * out_luma_height);
+
+ const auto in_luma_width{std::min(frame->GetWidth(), static_cast<s32>(out_luma_width))};
+ const auto in_luma_height{std::min(frame->GetHeight(), static_cast<s32>(out_luma_height))};
+ const auto in_luma_stride{frame->GetStride(0)};
+
+ const auto in_chroma_stride{frame->GetStride(1)};
+
+ const auto* luma_buffer{frame->GetPlane(0)};
+ const auto* chroma_u_buffer{frame->GetPlane(1)};
+ const auto* chroma_v_buffer{frame->GetPlane(2)};
+
+ LOG_TRACE(HW_GPU,
+ "Reading frame"
+ "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n"
+ "output luma {}x{} stride {} chroma {}x{} stride {}",
+ in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2,
+ in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width,
+ out_luma_height, out_luma_stride);
+
+ [[maybe_unused]] auto DecodeLinear = [&]() {
+ const auto alpha{static_cast<u16>(slot.config.planar_alpha.Value())};
+
+ for (s32 y = 0; y < in_luma_height; y++) {
+ const auto src_luma{y * in_luma_stride};
+ const auto src_chroma{(y / 2) * in_chroma_stride};
+ const auto dst{y * out_luma_stride};
+ for (s32 x = 0; x < in_luma_width; x++) {
+ slot_surface[dst + x].r = static_cast<u16>(luma_buffer[src_luma + x] << 2);
+ // Chroma samples are duplicated horizontally and vertically.
+ if constexpr (Planar) {
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + x / 2] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_v_buffer[src_chroma + x / 2] << 2);
+ } else {
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+ }
+ slot_surface[dst + x].a = alpha;
+ }
+ }
+ };
+
+#if defined(ARCHITECTURE_x86_64)
+ if (!has_sse41) {
+ DecodeLinear();
return;
}
- const u64 surface_width = config.surface_width_minus1 + 1;
- const u64 surface_height = config.surface_height_minus1 + 1;
- if (static_cast<u64>(frame->GetWidth()) != surface_width ||
- static_cast<u64>(frame->GetHeight()) != surface_height) {
- // TODO: Properly support multiple video streams with differing frame dimensions
- LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}",
- frame->GetWidth(), frame->GetHeight(), surface_width, surface_height);
+#endif
+
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
+ const auto alpha_linear{static_cast<u16>(slot.config.planar_alpha.Value())};
+ const auto alpha =
+ _mm_slli_epi64(_mm_set1_epi64x(static_cast<s64>(slot.config.planar_alpha.Value())), 48);
+
+ const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0);
+ const auto sse_aligned_width = Common::AlignDown(in_luma_width, 16);
+
+ for (s32 y = 0; y < in_luma_height; y++) {
+ const auto src_luma{y * in_luma_stride};
+ const auto src_chroma{(y / 2) * in_chroma_stride};
+ const auto dst{y * out_luma_stride};
+ s32 x = 0;
+ for (; x < sse_aligned_width; x += 16) {
+ // clang-format off
+ // Prefetch next iteration's memory
+ _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0);
+
+ // Load 8 bytes * 2 of 8-bit luma samples
+ // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL
+ auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]);
+ auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]);
+
+ __m128i chroma;
+
+ if constexpr (Planar) {
+ _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);
+ _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);
+
+ // If Chroma is planar, we have separate U and V planes, load 8 bytes of each
+ // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU
+ // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV
+ auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]);
+ auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]);
+
+ // Interleave the 8 bytes of U and V into a single 16 byte reg
+ // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
+ chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0);
+ } else {
+ _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);
+
+ // Chroma is already interleaved in semiplanar format, just load 16 bytes
+ // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
+ chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]);
+ }
+
+ // Convert the low 8 bytes of 8-bit luma into 16-bit luma
+ // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL]
+ // ->
+ // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL]
+ luma0 = _mm_cvtepu8_epi16(luma0);
+ luma1 = _mm_cvtepu8_epi16(luma1);
+
+ // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the
+ // U and V together as one element. Using chroma twice here duplicates the values, as we
+ // take element 0 from chroma, and then element 0 from chroma again, etc. We need to
+ // duplicate chroma horizontally as chroma is half the width of luma.
+ // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
+ // ->
+ // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1]
+ // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5]
+ auto chroma00 = _mm_unpacklo_epi16(chroma, chroma);
+ auto chroma01 = _mm_unpackhi_epi16(chroma, chroma);
+
+ // Interleave the 16-bit luma and chroma.
+ // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1]
+ // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
+ // ->
+ // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
+ // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5]
+ auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00);
+ auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00);
+ auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01);
+ auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01);
+
+ // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of
+ // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the
+ // alpha. Luma -> R, U -> G, V -> B, 0 -> A
+ // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
+ // ->
+ // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1]
+ yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask);
+ yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask);
+ yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask);
+ yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask);
+
+ // Extend the 8-bit channels we have into 16-bits, as that's the target surface format.
+ // Since this turns just the low 8 bytes into 16 bytes, the second of
+ // each operation here right shifts the register by 8 to get the high pixels.
+ // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1]
+ // ->
+ // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1]
+ // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] [003 UU3] [003 LL3]
+ auto yuv01 = _mm_cvtepu8_epi16(yuv0);
+ auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8));
+ auto yuv45 = _mm_cvtepu8_epi16(yuv1);
+ auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8));
+ auto yuv89 = _mm_cvtepu8_epi16(yuv2);
+ auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8));
+ auto yuv1213 = _mm_cvtepu8_epi16(yuv3);
+ auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8));
+
+ // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead
+ // of 8, which is the format alpha is in, as well as other blending values.
+ yuv01 = _mm_slli_epi16(yuv01, 2);
+ yuv23 = _mm_slli_epi16(yuv23, 2);
+ yuv45 = _mm_slli_epi16(yuv45, 2);
+ yuv67 = _mm_slli_epi16(yuv67, 2);
+ yuv89 = _mm_slli_epi16(yuv89, 2);
+ yuv1011 = _mm_slli_epi16(yuv1011, 2);
+ yuv1213 = _mm_slli_epi16(yuv1213, 2);
+ yuv1415 = _mm_slli_epi16(yuv1415, 2);
+
+ // OR in the planar alpha, this has already been duplicated and shifted into position,
+ // and just fills in the AA channels with the actual alpha value.
+ yuv01 = _mm_or_si128(yuv01, alpha);
+ yuv23 = _mm_or_si128(yuv23, alpha);
+ yuv45 = _mm_or_si128(yuv45, alpha);
+ yuv67 = _mm_or_si128(yuv67, alpha);
+ yuv89 = _mm_or_si128(yuv89, alpha);
+ yuv1011 = _mm_or_si128(yuv1011, alpha);
+ yuv1213 = _mm_or_si128(yuv1213, alpha);
+ yuv1415 = _mm_or_si128(yuv1415, alpha);
+
+ // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels.
+ // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL]
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213);
+ _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415);
+
+ // clang-format on
+ }
+
+ for (; x < in_luma_width; x++) {
+ slot_surface[dst + x].r = static_cast<u16>(luma_buffer[src_luma + x] << 2);
+ // Chroma samples are duplicated horizontally and vertically.
+ if constexpr (Planar) {
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + x / 2] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_v_buffer[src_chroma + x / 2] << 2);
+ } else {
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+ }
+ slot_surface[dst + x].a = alpha_linear;
+ }
+ }
+#else
+ DecodeLinear();
+#endif
+}
+/*
+ * Fills slot_surface from a single field of an interlaced Y8 / V8U8 4:2:0 frame.
+ *
+ * Template parameters:
+ *   Planar   - true when U and V are read from separate planes (GetPlane(1) / GetPlane(2)).
+ *              When false the whole call is forwarded to
+ *              ReadProgressiveY8__V8U8_N420<Planar, true> and nothing below runs.
+ *   TopField - true converts the even source rows (0, 2, ...), false the odd rows (1, 3, ...).
+ *
+ * Parameters:
+ *   slot    - supplies the slot surface dimensions, the planar alpha and the deinterlace mode.
+ *   offsets - only forwarded on the non-planar path; the planar path does not read it.
+ *   frame   - decoded frame providing plane pointers, strides and dimensions.
+ *
+ * WEAVE, BOB_FIELD and DISI1 all end up in DecodeBobField, which converts the rows of the selected
+ * field to 10-bit channels and copies each converted row onto the adjacent row of the other
+ * field. Any other deinterlace mode hits UNIMPLEMENTED_MSG and leaves slot_surface unwritten.
+ */
+template <bool Planar, bool TopField>
+void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame) {
+ if constexpr (!Planar) {
+ ReadProgressiveY8__V8U8_N420<Planar, true>(slot, offsets, std::move(frame));
+ return;
}
- switch (config.pixel_format) {
- case VideoPixelFormat::RGBA8:
- case VideoPixelFormat::BGRA8:
- case VideoPixelFormat::RGBX8:
- WriteRGBFrame(std::move(frame), config);
+ const auto out_luma_width{slot.surface_config.slot_surface_width + 1};
+ // NOTE(review): height is doubled, presumably because the slot height describes one field.
+ const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
+ const auto out_luma_stride{out_luma_width};
+
+ slot_surface.resize_destructive(out_luma_width * out_luma_height);
+
+ const auto in_luma_width{std::min(frame->GetWidth(), static_cast<s32>(out_luma_width))};
+ [[maybe_unused]] const auto in_luma_height{ // Only consumed by the trace log below.
+ std::min(frame->GetHeight(), static_cast<s32>(out_luma_height))};
+ const auto in_luma_stride{frame->GetStride(0)};
+
+ [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2};
+ const auto in_chroma_height{(frame->GetHeight() + 1) / 2};
+ const auto in_chroma_stride{frame->GetStride(1)};
+
+ const auto* luma_buffer{frame->GetPlane(0)};
+ const auto* chroma_u_buffer{frame->GetPlane(1)};
+ const auto* chroma_v_buffer{frame->GetPlane(2)};
+
+ LOG_TRACE(HW_GPU,
+ "Reading frame"
+ "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n"
+ "output luma {}x{} stride {} chroma {}x{} stride {}",
+ in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height,
+ in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride,
+ out_luma_width / 2, out_luma_height / 2, out_luma_stride);
+
+ [[maybe_unused]] auto DecodeLinear = [&]() {
+ auto DecodeBobField = [&]() {
+ const auto alpha{static_cast<u16>(slot.config.planar_alpha.Value())};
+ /* Walk every second row, starting at row 0 for the top field and row 1 for the bottom one.
+  * NOTE(review): the bound is in_chroma_height * 2, derived from the unclamped frame height,
+  * not in_luma_height / out_luma_height - confirm rows cannot run past slot_surface or the
+  * luma plane when the frame is taller than the slot or has an odd height. */
+ for (s32 y = static_cast<s32>(TopField == false); y < in_chroma_height * 2; y += 2) {
+ const auto src_luma{y * in_luma_stride};
+ const auto src_chroma{(y / 2) * in_chroma_stride};
+ const auto dst{y * out_luma_stride};
+ for (s32 x = 0; x < in_luma_width; x++) {
+ // 8-bit samples are shifted left by 2 before being stored in the 16-bit channels.
+ slot_surface[dst + x].r = static_cast<u16>(luma_buffer[src_luma + x] << 2);
+ if constexpr (Planar) {
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + x / 2] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_v_buffer[src_chroma + x / 2] << 2);
+ } else { // Never executed: the non-planar case returned at the top of the function.
+ slot_surface[dst + x].g =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
+ slot_surface[dst + x].b =
+ static_cast<u16>(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
+ }
+ slot_surface[dst + x].a = alpha;
+ }
+ // Duplicate the row just written into the adjacent row belonging to the other field.
+ s32 other_line{};
+ if constexpr (TopField) {
+ other_line = (y + 1) * out_luma_stride;
+ } else {
+ other_line = (y - 1) * out_luma_stride;
+ }
+ std::memcpy(&slot_surface[other_line], &slot_surface[dst], // Copies the full output width.
+ out_luma_width * sizeof(Pixel));
+ }
+ };
+
+ switch (slot.config.deinterlace_mode) {
+ case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE:
+ // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it
+ // relies on the previous frame.
+ DecodeBobField();
+ break;
+ case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD:
+ DecodeBobField();
+ break;
+ case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1:
+ // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it
+ // relies on previous/next frames.
+ DecodeBobField();
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!",
+ static_cast<s32>(slot.config.deinterlace_mode.Value()));
+ break;
+ }
+ };
+ // There is no SIMD variant of this reader; the scalar lambda is always used.
+ DecodeLinear();
+}
+/*
+ * Dispatches to the progressive or interlaced reader according to slot.config.frame_format.
+ * TOP_FIELD and BOTTOM_FIELD select the TopField template argument of the interlaced reader.
+ * Any other frame format is logged as an error and no reader is called.
+ */
+template <bool Planar>
+void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame) {
+ switch (slot.config.frame_format) {
+ case DXVAHD_FRAME_FORMAT::PROGRESSIVE:
+ ReadProgressiveY8__V8U8_N420<Planar>(slot, offsets, std::move(frame));
break;
- case VideoPixelFormat::YUV420:
- WriteYUVFrame(std::move(frame), config);
+ case DXVAHD_FRAME_FORMAT::TOP_FIELD:
+ ReadInterlacedY8__V8U8_N420<Planar, true>(slot, offsets, std::move(frame));
+ break;
+ case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD:
+ ReadInterlacedY8__V8U8_N420<Planar, false>(slot, offsets, std::move(frame));
break;
default:
- UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value());
+ LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", // NOTE(review): the value is frame_format.
+ static_cast<s32>(slot.config.frame_format.Value()));
break;
}
}
-void Vic::WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config) {
- LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
-
- const auto frame_width = frame->GetWidth();
- const auto frame_height = frame->GetHeight();
- const auto frame_format = frame->GetPixelFormat();
-
- if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) {
- const AVPixelFormat target_format = [pixel_format = config.pixel_format]() {
- switch (pixel_format) {
- case VideoPixelFormat::RGBA8:
- return AV_PIX_FMT_RGBA;
- case VideoPixelFormat::BGRA8:
- return AV_PIX_FMT_BGRA;
- case VideoPixelFormat::RGBX8:
- return AV_PIX_FMT_RGB0;
- default:
- return AV_PIX_FMT_RGBA;
- }
- }();
+void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { /*
+ * Copies the visible region of slot_surface into output_surface, optionally running every pixel
+ * through the slot's colour matrix.
+ *
+ * Parameters:
+ *   config - supplies the output target rect and the output surface dimensions.
+ *   slot   - supplies the source/destination rects, the colour matrix and the soft clamp range.
+ *
+ * Steps:
+ *   1. Read the source, destination and target rects (each edge passed through add_one).
+ *   2. Clip the target rect to the destination rect, then the source rect to that result, and
+ *      return early if nothing is left.
+ *   3. With the matrix disabled, memcpy each row. Otherwise convert each pixel, using the SIMD
+ *      path when it is compiled in (and, on x86_64, when has_sse41 is set) and the scalar
+ *      DecodeLinear lambda otherwise.
+ *
+ * Alpha blending between slots is not implemented (see the TODO below).
+ */
+ constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); // 0 stays 0, else v + 1.
- sws_freeContext(scaler_ctx);
- // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format
- scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width,
- frame_height, target_format, 0, nullptr, nullptr, nullptr);
- scaler_width = frame_width;
- scaler_height = frame_height;
- converted_frame_buffer.reset();
- }
- if (!converted_frame_buffer) {
- const size_t frame_size = frame_width * frame_height * 4;
- converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(frame_size)), av_free};
+ auto source_left{add_one(static_cast<u32>(slot.config.source_rect_left.Value()))};
+ auto source_right{add_one(static_cast<u32>(slot.config.source_rect_right.Value()))};
+ auto source_top{add_one(static_cast<u32>(slot.config.source_rect_top.Value()))};
+ auto source_bottom{add_one(static_cast<u32>(slot.config.source_rect_bottom.Value()))};
+
+ const auto dest_left{add_one(static_cast<u32>(slot.config.dest_rect_left.Value()))};
+ const auto dest_right{add_one(static_cast<u32>(slot.config.dest_rect_right.Value()))};
+ const auto dest_top{add_one(static_cast<u32>(slot.config.dest_rect_top.Value()))};
+ const auto dest_bottom{add_one(static_cast<u32>(slot.config.dest_rect_bottom.Value()))};
+
+ auto rect_left{add_one(config.output_config.target_rect_left.Value())};
+ auto rect_right{add_one(config.output_config.target_rect_right.Value())};
+ auto rect_top{add_one(config.output_config.target_rect_top.Value())};
+ auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())};
+ // Clip the output target rect to the slot's destination rect.
+ rect_left = std::max(rect_left, dest_left);
+ rect_right = std::min(rect_right, dest_right);
+ rect_top = std::max(rect_top, dest_top);
+ rect_bottom = std::min(rect_bottom, dest_bottom);
+ // Clip the source rect to the clipped target rect.
+ source_left = std::max(source_left, rect_left);
+ source_right = std::min(source_right, rect_right);
+ source_top = std::max(source_top, rect_top);
+ source_bottom = std::min(source_bottom, rect_bottom);
+ // Nothing to draw when the clipped source rect is empty.
+ if (source_left >= source_right || source_top >= source_bottom) {
+ return;
}
- const std::array<int, 4> converted_stride{frame_width * 4, frame_height * 4, 0, 0};
- u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
- sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height,
- &converted_frame_buf_addr, converted_stride.data());
-
- // Use the minimum of surface/frame dimensions to avoid buffer overflow.
- const u32 surface_width = static_cast<u32>(config.surface_width_minus1) + 1;
- const u32 surface_height = static_cast<u32>(config.surface_height_minus1) + 1;
- const u32 width = std::min(surface_width, static_cast<u32>(frame_width));
- const u32 height = std::min(surface_height, static_cast<u32>(frame_height));
- const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
- if (blk_kind != 0) {
- // swizzle pitch linear to block linear
- const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
- const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
- luma_buffer.resize_destructive(size);
- std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
- Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height,
- block_height, 0, width * 4);
-
- host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
+
+ const auto out_surface_width{config.output_surface_config.out_surface_width + 1};
+ [[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height +
+ 1};
+ const auto in_surface_width{slot.surface_config.slot_surface_width + 1};
+ // Also keep the source rect within the output surface dimensions.
+ source_bottom = std::min(source_bottom, out_surface_height);
+ source_right = std::min(source_right, out_surface_width);
+
+ // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
+ // below max, so it's ignored for now.
+
+ if (!slot.color_matrix.matrix_enable) { // No colour conversion: copy the rows unchanged.
+ const auto copy_width = std::min(source_right - source_left, rect_right - rect_left);
+
+ for (u32 y = source_top; y < source_bottom; y++) {
+ const auto dst_line = y * out_surface_width;
+ const auto src_line = y * in_surface_width;
+ std::memcpy(&output_surface[dst_line + rect_left],
+ &slot_surface[src_line + source_left], copy_width * sizeof(Pixel));
+ }
} else {
- // send pitch linear frame
- const size_t linear_size = width * height * 4;
- host1x.GMMU().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
- linear_size);
+ // clang-format off
+ // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
+ // | r0c0 r0c1 r0c2 r0c3 | | R | | R |
+ // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
+ // | r2c0 r2c1 r2c2 r2c3 | | B | | B |
+ // | 1 |
+ // clang-format on
+ // Scalar implementation, used when the SIMD path below is unavailable.
+ [[maybe_unused]] auto DecodeLinear = [&]() {
+ const auto r0c0 = static_cast<s32>(slot.color_matrix.matrix_coeff00.Value());
+ const auto r0c1 = static_cast<s32>(slot.color_matrix.matrix_coeff01.Value());
+ const auto r0c2 = static_cast<s32>(slot.color_matrix.matrix_coeff02.Value());
+ const auto r0c3 = static_cast<s32>(slot.color_matrix.matrix_coeff03.Value());
+ const auto r1c0 = static_cast<s32>(slot.color_matrix.matrix_coeff10.Value());
+ const auto r1c1 = static_cast<s32>(slot.color_matrix.matrix_coeff11.Value());
+ const auto r1c2 = static_cast<s32>(slot.color_matrix.matrix_coeff12.Value());
+ const auto r1c3 = static_cast<s32>(slot.color_matrix.matrix_coeff13.Value());
+ const auto r2c0 = static_cast<s32>(slot.color_matrix.matrix_coeff20.Value());
+ const auto r2c1 = static_cast<s32>(slot.color_matrix.matrix_coeff21.Value());
+ const auto r2c2 = static_cast<s32>(slot.color_matrix.matrix_coeff22.Value());
+ const auto r2c3 = static_cast<s32>(slot.color_matrix.matrix_coeff23.Value());
+
+ const auto shift = static_cast<s32>(slot.color_matrix.matrix_r_shift.Value());
+ const auto clamp_min = static_cast<s32>(slot.config.soft_clamp_low.Value());
+ const auto clamp_max = static_cast<s32>(slot.config.soft_clamp_high.Value());
+ // Returns the converted r, g, b plus the input alpha unchanged.
+ auto MatMul = [&](const Pixel& in_pixel) -> std::tuple<s32, s32, s32, s32> {
+ auto r = static_cast<s32>(in_pixel.r);
+ auto g = static_cast<s32>(in_pixel.g);
+ auto b = static_cast<s32>(in_pixel.b);
+ // The initial values above are overwritten here; only the s32 type is kept.
+ r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2;
+ g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2;
+ b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2;
+
+ r >>= shift;
+ g >>= shift;
+ b >>= shift;
+ // The fourth column is added after the matrix shift, matching the SIMD path below.
+ r += r0c3;
+ g += r1c3;
+ b += r2c3;
+
+ r >>= 8;
+ g >>= 8;
+ b >>= 8;
+
+ return {r, g, b, static_cast<s32>(in_pixel.a)};
+ };
+ /* NOTE(review): src and dst already include source_left / rect_left, and x also starts at
+  * source_left, so the column offset appears to be applied twice unless source_left is 0
+  * (the memcpy path above adds it once). Confirm the intended indexing. */
+ for (u32 y = source_top; y < source_bottom; y++) {
+ const auto src{y * in_surface_width + source_left};
+ const auto dst{y * out_surface_width + rect_left};
+ for (u32 x = source_left; x < source_right; x++) {
+ auto [r, g, b, a] = MatMul(slot_surface[src + x]);
+
+ r = std::clamp(r, clamp_min, clamp_max);
+ g = std::clamp(g, clamp_min, clamp_max);
+ b = std::clamp(b, clamp_min, clamp_max);
+ a = std::clamp(a, clamp_min, clamp_max);
+
+ output_surface[dst + x] = {static_cast<u16>(r), static_cast<u16>(g),
+ static_cast<u16>(b), static_cast<u16>(a)};
+ }
+ }
+ };
+
+#if defined(ARCHITECTURE_x86_64)
+ if (!has_sse41) {
+ DecodeLinear();
+ return;
+ }
+#endif
+
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
+ // Fill the columns, e.g
+ // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
+
+ const auto c0 = _mm_set_epi32(0, static_cast<s32>(slot.color_matrix.matrix_coeff20.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff10.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff00.Value()));
+ const auto c1 = _mm_set_epi32(0, static_cast<s32>(slot.color_matrix.matrix_coeff21.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff11.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff01.Value()));
+ const auto c2 = _mm_set_epi32(0, static_cast<s32>(slot.color_matrix.matrix_coeff22.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff12.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff02.Value()));
+ const auto c3 = _mm_set_epi32(0, static_cast<s32>(slot.color_matrix.matrix_coeff23.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff13.Value()),
+ static_cast<s32>(slot.color_matrix.matrix_coeff03.Value()));
+
+ // Set the matrix right-shift as a single element.
+ const auto shift =
+ _mm_set_epi32(0, 0, 0, static_cast<s32>(slot.color_matrix.matrix_r_shift.Value()));
+
+ // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel.
+ const auto clamp_min = _mm_set1_epi16(static_cast<u16>(slot.config.soft_clamp_low.Value()));
+ const auto clamp_max =
+ _mm_set1_epi16(static_cast<u16>(slot.config.soft_clamp_high.Value()));
+
+ // clang-format off
+
+ auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2,
+ const __m128i& col3, const __m128i& trm_shift) -> __m128i {
+ // Duplicate the 32-bit channels, e.g
+ // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR]
+ // ->
+ // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1]
+ auto r = _mm_shuffle_epi32(p, 0x0);
+ auto g = _mm_shuffle_epi32(p, 0x55);
+ auto b = _mm_shuffle_epi32(p, 0xAA);
+
+ // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g
+ // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1]
+ // *
+ // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
+ r = _mm_mullo_epi32(r, col0);
+ g = _mm_mullo_epi32(g, col1);
+ b = _mm_mullo_epi32(b, col2);
+
+ // Add them all together vertically, such that the 32-bit element
+ // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0])
+ auto out = _mm_add_epi32(_mm_add_epi32(r, g), b);
+
+ // Shift the result by r_shift, as the TRM says
+ out = _mm_sra_epi32(out, trm_shift);
+
+ // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to
+ // multiply by it, and as per the TRM this column ignores r_shift, so it's just added
+ // here after shifting.
+ out = _mm_add_epi32(out, col3);
+
+ // Shift the result back from S12.8 to integer values
+ return _mm_srai_epi32(out, 8);
+ };
+ /* NOTE(review): same src + x / dst + x indexing as the scalar loop above. The loop also
+  * advances 8 pixels at a time with no scalar tail, so a width that is not a multiple of 8
+  * touches pixels past source_right, and _mm_load_si128 / _mm_store_si128 require 16-byte
+  * aligned addresses. Presumably the surfaces are padded and aligned accordingly - verify. */
+ for (u32 y = source_top; y < source_bottom; y++) {
+ const auto src{y * in_surface_width + source_left};
+ const auto dst{y * out_surface_width + rect_left};
+ for (u32 x = source_left; x < source_right; x += 8) {
+ // clang-format off
+ // Prefetch the next iteration's memory
+ _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0);
+
+ // Load in pixels
+ // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR]
+ auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]);
+ auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]);
+ auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]);
+ auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]);
+
+ // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are
+ // 32-bit and to avoid overflow.
+ // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
+ // ->
+ // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
+ // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
+ auto p01_lo = _mm_cvtepu16_epi32(p01);
+ auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8));
+ auto p23_lo = _mm_cvtepu16_epi32(p23);
+ auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8));
+ auto p45_lo = _mm_cvtepu16_epi32(p45);
+ auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8));
+ auto p67_lo = _mm_cvtepu16_epi32(p67);
+ auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8));
+
+ // Matrix multiply the pixel, doing the colour conversion.
+ auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift);
+ auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift);
+ auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift);
+ auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift);
+ auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift);
+ auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift);
+ auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift);
+ auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift);
+
+ // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation
+ // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
+ // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
+ // ->
+ // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
+ auto done0 = _mm_packus_epi32(out0, out1);
+ auto done1 = _mm_packus_epi32(out2, out3);
+ auto done2 = _mm_packus_epi32(out4, out5);
+ auto done3 = _mm_packus_epi32(out6, out7);
+
+ // Blend the original alpha back into the pixel, as the matrix multiply gives us a
+ // 3-channel output, not 4.
+ // 0x88 = b10001000, taking RGB from the first argument, A from the second argument.
+ // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1]
+ // ->
+ // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
+ done0 = _mm_blend_epi16(done0, p01, 0x88);
+ done1 = _mm_blend_epi16(done1, p23, 0x88);
+ done2 = _mm_blend_epi16(done2, p45, 0x88);
+ done3 = _mm_blend_epi16(done3, p67, 0x88);
+
+ // Clamp the 16-bit channels to the soft-clamp min/max.
+ done0 = _mm_max_epu16(done0, clamp_min);
+ done1 = _mm_max_epu16(done1, clamp_min);
+ done2 = _mm_max_epu16(done2, clamp_min);
+ done3 = _mm_max_epu16(done3, clamp_min);
+
+ done0 = _mm_min_epu16(done0, clamp_max);
+ done1 = _mm_min_epu16(done1, clamp_max);
+ done2 = _mm_min_epu16(done2, clamp_max);
+ done3 = _mm_min_epu16(done3, clamp_max);
+
+ // Store the pixels to the output surface.
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0);
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1);
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2);
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3);
+
+ }
+ }
+ // clang-format on
+#else
+ DecodeLinear();
+#endif
}
}
-void Vic::WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config) {
- LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
+void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
+ constexpr u32 BytesPerPixel = 1;
- const std::size_t surface_width = config.surface_width_minus1 + 1;
- const std::size_t surface_height = config.surface_height_minus1 + 1;
- const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
- // Use the minimum of surface/frame dimensions to avoid buffer overflow.
- const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->GetWidth()));
- const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->GetHeight()));
+ auto surface_width{output_surface_config.out_surface_width + 1};
+ auto surface_height{output_surface_config.out_surface_height + 1};
+ const auto surface_stride{surface_width};
- const auto stride = static_cast<size_t>(frame->GetStride(0));
+ const auto out_luma_width = output_surface_config.out_luma_width + 1;
+ const auto out_luma_height = output_surface_config.out_luma_height + 1;
+ const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10);
+ const auto out_luma_size = out_luma_height * out_luma_stride;
- luma_buffer.resize_destructive(aligned_width * surface_height);
- chroma_buffer.resize_destructive(aligned_width * surface_height / 2);
+ const auto out_chroma_width = output_surface_config.out_chroma_width + 1;
+ const auto out_chroma_height = output_surface_config.out_chroma_height + 1;
+ const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10);
+ const auto out_chroma_size = out_chroma_height * out_chroma_stride;
- // Populate luma buffer
- const u8* luma_src = frame->GetData(0);
- for (std::size_t y = 0; y < frame_height; ++y) {
- const std::size_t src = y * stride;
- const std::size_t dst = y * aligned_width;
- std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width);
- }
- host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size());
-
- // Chroma
- const std::size_t half_height = frame_height / 2;
- const auto half_stride = static_cast<size_t>(frame->GetStride(1));
-
- switch (frame->GetPixelFormat()) {
- case AV_PIX_FMT_YUV420P: {
- // Frame from FFmpeg software
- // Populate chroma buffer from both channels with interleaving.
- const std::size_t half_width = frame_width / 2;
- u8* chroma_buffer_data = chroma_buffer.data();
- const u8* chroma_b_src = frame->GetData(1);
- const u8* chroma_r_src = frame->GetData(2);
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * half_stride;
- const std::size_t dst = y * aligned_width;
- for (std::size_t x = 0; x < half_width; ++x) {
- chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x];
- chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x];
+ surface_width = std::min(surface_width, out_luma_width);
+ surface_height = std::min(surface_height, out_luma_height);
+
+ [[maybe_unused]] auto DecodeLinear = [&](std::span<u8> out_luma, std::span<u8> out_chroma) {
+ for (u32 y = 0; y < surface_height; ++y) {
+ const auto src_luma = y * surface_stride;
+ const auto dst_luma = y * out_luma_stride;
+ const auto src_chroma = y * surface_stride;
+ const auto dst_chroma = (y / 2) * out_chroma_stride;
+ for (u32 x = 0; x < surface_width; x += 2) {
+ out_luma[dst_luma + x + 0] =
+ static_cast<u8>(output_surface[src_luma + x + 0].r >> 2);
+ out_luma[dst_luma + x + 1] =
+ static_cast<u8>(output_surface[src_luma + x + 1].r >> 2);
+ out_chroma[dst_chroma + x + 0] =
+ static_cast<u8>(output_surface[src_chroma + x].g >> 2);
+ out_chroma[dst_chroma + x + 1] =
+ static_cast<u8>(output_surface[src_chroma + x].b >> 2);
}
}
- break;
- }
- case AV_PIX_FMT_NV12: {
- // Frame from VA-API hardware
- // This is already interleaved so just copy
- const u8* chroma_src = frame->GetData(1);
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * stride;
- const std::size_t dst = y * aligned_width;
- std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width);
+ };
+
+ auto Decode = [&](std::span<u8> out_luma, std::span<u8> out_chroma) { // Splits output_surface into an 8-bit luma plane and an interleaved 8-bit chroma plane: 16 pixels per SIMD iteration, then a scalar tail.
+#if defined(ARCHITECTURE_x86_64)
+ if (!has_sse41) { // _mm_packus_epi32 used below is an SSE4.1 instruction
+ DecodeLinear(out_luma, out_chroma);
+ return;
+ }
+#endif
+
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
+ // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF]
+ const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
+
+ const auto sse_aligned_width = Common::AlignDown(surface_width, 16); // widest multiple of 16 pixels; the rest goes to the scalar tail
+
+ for (u32 y = 0; y < surface_height; ++y) {
+ const auto src = y * surface_stride;
+ const auto dst_luma = y * out_luma_stride;
+ const auto dst_chroma = (y / 2) * out_chroma_stride; // two luma rows share one chroma row
+ u32 x = 0;
+ for (; x < sse_aligned_width; x += 16) {
+ // clang-format off
+ // Prefetch the next cache lines, 2 per iteration
+ _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0);
+ _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0);
+
+ // Load the 64-bit pixels, 2 per variable.
+ auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]);
+ auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]);
+ auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]);
+ auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]);
+ auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]);
+ auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]);
+ auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]);
+ auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]);
+
+ // Split out the luma of each pixel using the luma_mask above.
+ // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
+ // ->
+ // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1]
+ auto l01 = _mm_and_si128(pixel01, luma_mask);
+ auto l23 = _mm_and_si128(pixel23, luma_mask);
+ auto l45 = _mm_and_si128(pixel45, luma_mask);
+ auto l67 = _mm_and_si128(pixel67, luma_mask);
+ auto l89 = _mm_and_si128(pixel89, luma_mask);
+ auto l1011 = _mm_and_si128(pixel1011, luma_mask);
+ auto l1213 = _mm_and_si128(pixel1213, luma_mask);
+ auto l1415 = _mm_and_si128(pixel1415, luma_mask);
+
+ // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
+ // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1]
+ // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3]
+ // ->
+ // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1]
+ auto l0123 = _mm_packus_epi32(l01, l23);
+ auto l4567 = _mm_packus_epi32(l45, l67);
+ auto l891011 = _mm_packus_epi32(l89, l1011);
+ auto l12131415 = _mm_packus_epi32(l1213, l1415);
+
+ // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
+ // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1]
+ // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5]
+ // ->
+ // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1]
+ auto luma_lo = _mm_packus_epi32(l0123, l4567);
+ auto luma_hi = _mm_packus_epi32(l891011, l12131415);
+
+ // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
+ // and bringing the range back to 8-bit.
+ luma_lo = _mm_srli_epi16(luma_lo, 2);
+ luma_hi = _mm_srli_epi16(luma_hi, 2);
+
+ // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register.
+ // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1]
+ // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9]
+ // ->
+ // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1]
+ auto luma = _mm_packus_epi16(luma_lo, luma_hi);
+
+ // Store the 16 bytes of luma
+ _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma);
+
+ if (y % 2 == 0) {
+ // Chroma, done every other line as it's half the height of luma.
+
+ // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma.
+ // We can do this instead of &'ing a mask and then shifting.
+ // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
+ // ->
+ // c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1]
+ auto c01 = _mm_srli_si128(pixel01, 2);
+ auto c23 = _mm_srli_si128(pixel23, 2);
+ auto c45 = _mm_srli_si128(pixel45, 2);
+ auto c67 = _mm_srli_si128(pixel67, 2);
+ auto c89 = _mm_srli_si128(pixel89, 2);
+ auto c1011 = _mm_srli_si128(pixel1011, 2);
+ auto c1213 = _mm_srli_si128(pixel1213, 2);
+ auto c1415 = _mm_srli_si128(pixel1415, 2);
+
+ // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register.
+ // This has the effect of skipping every other chroma value horizontally,
+ // notice the high pixels UU2/UU4 are skipped.
+ // This is intended as N420 chroma width is half the luma width.
+ // c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1]
+ // c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3]
+ // ->
+ // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1]
+ auto c0123 = _mm_unpacklo_epi32(c01, c23);
+ auto c4567 = _mm_unpacklo_epi32(c45, c67);
+ auto c891011 = _mm_unpacklo_epi32(c89, c1011);
+ auto c12131415 = _mm_unpacklo_epi32(c1213, c1415);
+
+ // Interleave the low 64-bit elements from 2 registers into 1.
+ // c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+ // c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5]
+ // ->
+ // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+ auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567);
+ auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415);
+
+ // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
+ // and bringing the range back to 8-bit.
+ chroma_lo = _mm_srli_epi16(chroma_lo, 2);
+ chroma_hi = _mm_srli_epi16(chroma_hi, 2);
+
+ // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register.
+ // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1]
+ // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9]
+ // ->
+ // chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1]
+ auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi);
+
+ // Store the 16 bytes of chroma.
+ _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma);
+ }
+
+ // clang-format on
+ }
+
+ const auto src_chroma = y * surface_stride; // same value as src above
+ for (; x < surface_width; x += 2) { // Scalar tail for the columns past sse_aligned_width. NOTE(review): unlike the SIMD path, chroma is stored here on every row, not only even rows - confirm this is intended.
+ out_luma[dst_luma + x + 0] = static_cast<u8>(output_surface[src + x + 0].r >> 2);
+ out_luma[dst_luma + x + 1] = static_cast<u8>(output_surface[src + x + 1].r >> 2);
+ out_chroma[dst_chroma + x + 0] =
+ static_cast<u8>(output_surface[src_chroma + x].g >> 2);
+ out_chroma[dst_chroma + x + 1] =
+ static_cast<u8>(output_surface[src_chroma + x].b >> 2);
+ }
+ }
+#else
+ DecodeLinear(out_luma, out_chroma); // no SIMD path on other architectures
+#endif
+ };
+
+ switch (output_surface_config.out_block_kind) {
+ case BLK_KIND::GENERIC_16Bx2: {
+ // Block-linear output: decode into linear scratch buffers first, then swizzle each plane
+ // into guest memory.
+ const u32 block_height = static_cast<u32>(output_surface_config.out_block_height);
+ const auto out_luma_swizzle_size = Texture::CalculateSize(
+ true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0);
+ const auto out_chroma_swizzle_size = Texture::CalculateSize(
+ true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0);
+
+ // The adjacent literals below form ONE format string (16 placeholders, 16 arguments).
+ // A comma between two of the literals would turn the remainder into a format argument.
+ LOG_TRACE(
+ HW_GPU,
+ "Writing Y8__V8U8_N420 swizzled frame\n"
+ "\tinput surface {}x{} stride {} size 0x{:X}\n"
+ "\toutput luma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}\n"
+ "\toutput chroma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}",
+ surface_width, surface_height, surface_stride * BytesPerPixel,
+ surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+ out_luma_stride, out_luma_size, block_height, out_luma_swizzle_size, out_chroma_width,
+ out_chroma_height, out_chroma_stride, out_chroma_size, block_height,
+ out_chroma_swizzle_size);
+
+ luma_scratch.resize_destructive(out_luma_size);
+ chroma_scratch.resize_destructive(out_chroma_size);
+
+ Decode(luma_scratch, chroma_scratch);
+
+ Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(
+ memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size,
+ &swizzle_scratch);
+
+ // A block height of 1 goes through the local SwizzleSurface helper, anything else through
+ // the generic texture swizzler.
+ if (block_height == 1) {
+ SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride,
+ out_luma_height);
+ } else {
+ Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width,
+ out_luma_height, 1, block_height, 0, 1);
}
+
+ Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite>
+ out_chroma(memory_manager, regs.output_surface.chroma_u.Address(),
+ out_chroma_swizzle_size, &swizzle_scratch);
+
+ if (block_height == 1) {
+ SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride,
+ out_chroma_height);
+ } else {
+ // NOTE(review): out_chroma_swizzle_size above is computed with BytesPerPixel * 2, but
+ // the swizzle here is given BytesPerPixel - confirm which one matches the chroma plane.
+ Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width,
+ out_chroma_height, 1, block_height, 0, 1);
+ }
+ } break;
+ case BLK_KIND::PITCH: {
+ // Pitch-linear output: no swizzling, so only sizes and strides are traced. The adjacent
+ // literals form ONE format string (12 placeholders, 12 arguments).
+ LOG_TRACE(
+ HW_GPU,
+ "Writing Y8__V8U8_N420 pitch frame\n"
+ "\tinput surface {}x{} stride {} size 0x{:X}\n"
+ "\toutput luma {}x{} stride {} size 0x{:X}\n"
+ "\toutput chroma {}x{} stride {} size 0x{:X}",
+ surface_width, surface_height, surface_stride * BytesPerPixel,
+ surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+ out_luma_stride, out_luma_size, out_chroma_width, out_chroma_height, out_chroma_stride,
+ out_chroma_size);
+
+ // Unfortunately, due to a driver bug or game bug, the chroma address may not be
+ // appropriately spaced from the luma, so the luma of size out_stride * height runs into the
+ // top of the chroma buffer. That rules out an optimisation here where we could create
+ // guest spans and decode into game memory directly, avoiding the memory copy from scratch
+ // to game. Because of this we must write the luma first, and the chroma afterwards, so the
+ // chroma overwrites whatever the oversized luma spilled into it.
+ luma_scratch.resize_destructive(out_luma_size);
+ chroma_scratch.resize_destructive(out_chroma_size);
+
+ Decode(luma_scratch, chroma_scratch);
+
+ // Order matters: luma first, then chroma (see above).
+ memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(),
+ out_luma_size);
+ memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(),
+ out_chroma_size);
+ } break;
+ default:
+ UNREACHABLE();
break;
}
+}
+
+// Writes output_surface to the guest output surface as 4 bytes per pixel, 8 bits per channel.
+// Every 16-bit channel is shifted right by 2 before being narrowed to 8 bits.
+// For Format == VideoPixelFormat::A8R8G8B8 the bytes of a pixel are stored in B, G, R, A order,
+// for every other Format in R, G, B, A order.
+// Only the GENERIC_16Bx2 (swizzled) and PITCH (linear) block kinds are handled; any other kind
+// reaches UNREACHABLE().
+template <VideoPixelFormat Format>
+void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
+ constexpr u32 BytesPerPixel = 4;
+
+ // The config fields store "value - 1", hence the + 1.
+ auto surface_width{output_surface_config.out_surface_width + 1};
+ auto surface_height{output_surface_config.out_surface_height + 1};
+ const auto surface_stride{surface_width};
+
+ const auto out_luma_width = output_surface_config.out_luma_width + 1;
+ const auto out_luma_height = output_surface_config.out_luma_height + 1;
+ const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10);
+ const auto out_luma_size = out_luma_height * out_luma_stride;
+
+ // Never convert more than fits in the output. surface_stride keeps the unclamped width.
+ surface_width = std::min(surface_width, out_luma_width);
+ surface_height = std::min(surface_height, out_luma_height);
+
+ // Scalar conversion of a single pixel, shared by DecodeLinear and the SIMD tail loop.
+ auto StorePixel = [&](std::span<u8> out_buffer, auto src_index, auto dst_index) {
+ const auto& pixel = output_surface[src_index];
+ if constexpr (Format == VideoPixelFormat::A8R8G8B8) {
+ out_buffer[dst_index + 0] = static_cast<u8>(pixel.b >> 2);
+ out_buffer[dst_index + 1] = static_cast<u8>(pixel.g >> 2);
+ out_buffer[dst_index + 2] = static_cast<u8>(pixel.r >> 2);
+ out_buffer[dst_index + 3] = static_cast<u8>(pixel.a >> 2);
+ } else {
+ out_buffer[dst_index + 0] = static_cast<u8>(pixel.r >> 2);
+ out_buffer[dst_index + 1] = static_cast<u8>(pixel.g >> 2);
+ out_buffer[dst_index + 2] = static_cast<u8>(pixel.b >> 2);
+ out_buffer[dst_index + 3] = static_cast<u8>(pixel.a >> 2);
+ }
+ };
+
+ // Portable fallback: converts every pixel with the scalar helper.
+ [[maybe_unused]] auto DecodeLinear = [&](std::span<u8> out_buffer) {
+ for (u32 y = 0; y < surface_height; y++) {
+ const auto src = y * surface_stride;
+ const auto dst = y * out_luma_stride;
+ for (u32 x = 0; x < surface_width; x++) {
+ StorePixel(out_buffer, src + x, dst + x * 4);
+ }
+ }
+ };
+
+ // Converts 16 pixels per iteration with SIMD where available, then finishes each row with
+ // the scalar helper.
+ auto Decode = [&](std::span<u8> out_buffer) {
+#if defined(ARCHITECTURE_x86_64)
+ if (!has_sse41) {
+ DecodeLinear(out_buffer);
+ return;
+ }
+#endif
+
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
+ constexpr size_t SseAlignment = 16;
+ const auto sse_aligned_width = Common::AlignDown(surface_width, SseAlignment);
+
+ for (u32 y = 0; y < surface_height; y++) {
+ const auto src = y * surface_stride;
+ const auto dst = y * out_luma_stride;
+ u32 x = 0;
+ for (; x < sse_aligned_width; x += SseAlignment) {
+ // clang-format off
+ // Prefetch the next 2 cache lines
+ _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0);
+ _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0);
+
+ // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g
+ // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR]
+ auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]);
+ auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]);
+ auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]);
+ auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]);
+ auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]);
+ auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]);
+ auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]);
+ auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]);
+
+ // Right-shift the channels by 2 to un-do the left shift on read and bring the range
+ // back to 8-bit.
+ pixel01 = _mm_srli_epi16(pixel01, 2);
+ pixel23 = _mm_srli_epi16(pixel23, 2);
+ pixel45 = _mm_srli_epi16(pixel45, 2);
+ pixel67 = _mm_srli_epi16(pixel67, 2);
+ pixel89 = _mm_srli_epi16(pixel89, 2);
+ pixel1011 = _mm_srli_epi16(pixel1011, 2);
+ pixel1213 = _mm_srli_epi16(pixel1213, 2);
+ pixel1415 = _mm_srli_epi16(pixel1415, 2);
+
+ // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register.
+ // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
+ // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3]
+ // ->
+ // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1]
+ auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23);
+ auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67);
+ auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011);
+ auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415);
+
+ if constexpr (Format == VideoPixelFormat::A8R8G8B8) {
+ const auto shuffle =
+ _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);
+
+ // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle.
+ // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1]
+ // ->
+ // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1]
+ pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle);
+ pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle);
+ pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle);
+ pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle);
+ }
+
+ // Store the pixels
+ _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo);
+ _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi);
+ _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo);
+ _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi);
+
+ // clang-format on
+ }
+
+ // Scalar tail for the columns past sse_aligned_width.
+ for (; x < surface_width; x++) {
+ StorePixel(out_buffer, src + x, dst + x * 4);
+ }
+ }
+#else
+ DecodeLinear(out_buffer);
+#endif
+ };
+
+ switch (output_surface_config.out_block_kind) {
+ case BLK_KIND::GENERIC_16Bx2: {
+ // Block-linear output: decode into a linear scratch buffer, then swizzle into guest memory.
+ const u32 block_height = static_cast<u32>(output_surface_config.out_block_height);
+ const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width,
+ out_luma_height, 1, block_height, 0);
+
+ LOG_TRACE(
+ HW_GPU,
+ "Writing ABGR swizzled frame\n"
+ "\tinput surface {}x{} stride {} size 0x{:X}\n"
+ "\toutput surface {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}",
+ surface_width, surface_height, surface_stride * BytesPerPixel,
+ surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+ out_luma_stride, out_luma_size, block_height, out_swizzle_size);
+
+ luma_scratch.resize_destructive(out_luma_size);
+
+ Decode(luma_scratch);
+
+ Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(
+ memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch);
+
+ if (block_height == 1) {
+ SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride,
+ out_luma_height);
+ } else {
+ Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width,
+ out_luma_height, 1, block_height, 0, 1);
+ }
+
+ } break;
+ case BLK_KIND::PITCH: {
+ // Pitch-linear output: decode straight into the scoped guest-memory span.
+ LOG_TRACE(HW_GPU,
+ "Writing ABGR pitch frame\n"
+ "\tinput surface {}x{} stride {} size 0x{:X}\n"
+ "\toutput surface {}x{} stride {} size 0x{:X}",
+ surface_width, surface_height, surface_stride * BytesPerPixel,
+ surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+ out_luma_stride, out_luma_size);
+
+ luma_scratch.resize_destructive(out_luma_size);
+
+ Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(
+ memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch);
+
+ Decode(out_luma);
+ } break;
default:
- ASSERT(false);
+ UNREACHABLE();
break;
}
- host1x.GMMU().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
- chroma_buffer.size());
}
-} // namespace Host1x
-
-} // namespace Tegra
+} // namespace Tegra::Host1x
diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h
index 6c868f062..e7600941a 100644
--- a/src/video_core/host1x/vic.h
+++ b/src/video_core/host1x/vic.h
@@ -3,65 +3,646 @@
#pragma once
+#include <condition_variable>
+#include <functional>
#include <memory>
+#include <mutex>
+#include <thread>
#include "common/common_types.h"
#include "common/scratch_buffer.h"
+#include "video_core/cdma_pusher.h"
-struct SwsContext;
+namespace Tegra::Host1x {
+class Host1x;
+class Nvdec;
-namespace Tegra {
+struct Pixel { // One pixel as four 16-bit channels; the writers in vic.cpp narrow each channel to 8 bits with ">> 2".
+ u16 r; // read as luma by the Y8__V8U8_N420 writer
+ u16 g; // read as the first chroma byte by the Y8__V8U8_N420 writer
+ u16 b; // read as the second chroma byte by the Y8__V8U8_N420 writer
+ u16 a;
+};
-namespace Host1x {
+// One underscore represents separate pixels.
+// Double underscore represents separate planes.
+// _N represents chroma subsampling, not a separate pixel.
+enum class VideoPixelFormat : u32 { // Stored in 7-bit BitFields (out_pixel_format / slot_pixel_format), so values must stay below 128.
+ A8 = 0,
+ L8 = 1,
+ A4L4 = 2,
+ L4A4 = 3,
+ R8 = 4,
+ A8L8 = 5,
+ L8A8 = 6,
+ R8G8 = 7,
+ G8R8 = 8,
+ B5G6R5 = 9,
+ R5G6B5 = 10,
+ B6G5R5 = 11,
+ R5G5B6 = 12,
+ A1B5G5R5 = 13,
+ A1R5G5B5 = 14,
+ B5G5R5A1 = 15,
+ R5G5B5A1 = 16,
+ A5B5G5R1 = 17,
+ A5R1G5B5 = 18,
+ B5G5R1A5 = 19,
+ R1G5B5A5 = 20,
+ X1B5G5R5 = 21,
+ X1R5G5B5 = 22,
+ B5G5R5X1 = 23,
+ R5G5B5X1 = 24,
+ A4B4G5R4 = 25, // NOTE(review): neighbours are all 4-4-4-4 (A4R4G4B4, ...); "G5" may be a typo - confirm against hardware docs
+ A4R4G4B4 = 26,
+ B4G4R4A4 = 27,
+ R4G4B4A4 = 28,
+ B8G8R8 = 29,
+ R8G8B8 = 30,
+ A8B8G8R8 = 31,
+ A8R8G8B8 = 32, // the only format for which Vic::WriteABGR stores bytes in B, G, R, A order
+ B8G8R8A8 = 33,
+ R8G8B8A8 = 34,
+ X8B8G8R8 = 35,
+ X8R8G8B8 = 36,
+ B8G8R8X8 = 37,
+ R8G8B8X8 = 38,
+ A8B10G10R10 = 39, // NOTE(review): neighbours are 2-10-10-10 (A2R10G10B10, ...); "A8" may be a typo - confirm against hardware docs
+ A2R10G10B10 = 40,
+ B10G10R10A2 = 41,
+ R10G10B10A2 = 42,
+ A4P4 = 43,
+ P4A4 = 44,
+ P8A8 = 45,
+ A8P8 = 46,
+ P8 = 47,
+ P1 = 48,
+ U8V8 = 49,
+ V8U8 = 50,
+ A8Y8U8V8 = 51,
+ V8U8Y8A8 = 52,
+ Y8U8V8 = 53,
+ Y8V8U8 = 54,
+ U8V8Y8 = 55,
+ V8U8Y8 = 56,
+ Y8U8_Y8V8 = 57,
+ Y8V8_Y8U8 = 58,
+ U8Y8_V8Y8 = 59,
+ V8Y8_U8Y8 = 60,
+ Y8__U8V8_N444 = 61,
+ Y8__V8U8_N444 = 62,
+ Y8__U8V8_N422 = 63,
+ Y8__V8U8_N422 = 64,
+ Y8__U8V8_N422R = 65,
+ Y8__V8U8_N422R = 66,
+ Y8__U8V8_N420 = 67,
+ Y8__V8U8_N420 = 68,
+ Y8__U8__V8_N444 = 69,
+ Y8__U8__V8_N422 = 70,
+ Y8__U8__V8_N422R = 71,
+ Y8__U8__V8_N420 = 72,
+ U8 = 73,
+ V8 = 74,
+};
-class Host1x;
-class Nvdec;
-union VicConfig;
+struct Offset { // 32-bit register value holding an address divided by 256.
+ constexpr u32 Address() const noexcept { // Returns the stored value shifted left by 8 bits.
+ return offset << 8; // computed in 32 bits, so the top 8 bits of offset are discarded. NOTE(review): confirm addresses never need more than 32 bits.
+ }
+
+private:
+ u32 offset;
+};
+static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
+static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
+
+struct PlaneOffsets { // Three consecutive Offset registers, one per plane of a surface.
+ Offset luma; // also the destination of the single-plane ABGR output in vic.cpp
+ Offset chroma_u; // destination of the interleaved chroma plane in the Y8__V8U8_N420 writer
+ Offset chroma_v;
+};
+static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!");
+
+enum SurfaceIndex : u32 { // Unscoped enum with 8 values (0-7); presumably indexes the inner dimension of VicRegisters::surfaces - confirm against its users.
+ Current = 0,
+ Previous = 1,
+ Next = 2,
+ NextNoiseReduced = 3,
+ CurrentMotion = 4,
+ PreviousMotion = 5,
+ PreviousPreviousMotion = 6,
+ CombinedMotion = 7,
+};
+
+enum class DXVAHD_ALPHA_FILL_MODE : u32 { // Value of the 3-bit OutputConfig::alpha_fill_mode field.
+ OPAQUE = 0,
+ BACKGROUND = 1,
+ DESTINATION = 2,
+ SOURCE_STREAM = 3,
+ COMPOSITED = 4,
+ SOURCE_ALPHA = 5,
+};
+
+enum class DXVAHD_FRAME_FORMAT : u64 { // Value of the 4-bit SlotConfig::frame_format field; u64 to match the other fields of that union.
+ PROGRESSIVE = 0,
+ INTERLACED_TOP_FIELD_FIRST = 1,
+ INTERLACED_BOTTOM_FIELD_FIRST = 2,
+ TOP_FIELD = 3,
+ BOTTOM_FIELD = 4,
+ SUBPIC_PROGRESSIVE = 5,
+ SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6,
+ SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7,
+ SUBPIC_TOP_FIELD = 8,
+ SUBPIC_BOTTOM_FIELD = 9,
+ TOP_FIELD_CHROMA_BOTTOM = 10,
+ BOTTOM_FIELD_CHROMA_TOP = 11,
+ SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12,
+ SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13,
+};
+
+enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 { // Value of the 4-bit SlotConfig::deinterlace_mode field.
+ WEAVE = 0,
+ BOB_FIELD = 1,
+ BOB = 2,
+ NEWBOB = 3,
+ DISI1 = 4,
+ WEAVE_LUMA_BOB_FIELD_CHROMA = 5,
+ MAX = 0xF, // largest value a 4-bit field can hold
+};
+
+enum class BLK_KIND { // Value of the 4-bit OutputSurfaceConfig::out_block_kind field; the writers in vic.cpp switch on it.
+ PITCH = 0, // written linearly by the output writers
+ GENERIC_16Bx2 = 1, // written swizzled by the output writers
+ // These are unsupported in the vic
+ BL_NAIVE = 2,
+ BL_KEPLER_XBAR_RAW = 3,
+ VP2_TILED = 15,
+};
+
+enum class BLEND_SRCFACTC : u32 { // Value of the 3-bit BlendingSlotStruct::src_factor_color_match_select field.
+ K1 = 0,
+ K1_TIMES_DST = 1,
+ NEG_K1_TIMES_DST = 2,
+ K1_TIMES_SRC = 3,
+ ZERO = 4,
+};
+
+enum class BLEND_DSTFACTC : u32 { // Value of the 3-bit BlendingSlotStruct::dst_factor_color_match_select field.
+ K1 = 0,
+ K2 = 1,
+ K1_TIMES_DST = 2,
+ NEG_K1_TIMES_DST = 3,
+ NEG_K1_TIMES_SRC = 4,
+ ZERO = 5,
+ ONE = 6,
+};
+
+enum class BLEND_SRCFACTA : u32 { // Value of the 3-bit BlendingSlotStruct::src_factor_a_match_select field.
+ K1 = 0,
+ K2 = 1,
+ NEG_K1_TIMES_DST = 2,
+ ZERO = 3,
+ MAX = 7, // largest value a 3-bit field can hold
+};
+
+enum class BLEND_DSTFACTA : u32 { // Value of the 3-bit BlendingSlotStruct::dst_factor_a_match_select field.
+ K2 = 0,
+ NEG_K1_TIMES_SRC = 1,
+ ZERO = 2,
+ ONE = 3,
+ MAX = 7, // largest value a 3-bit field can hold
+};
+
+struct PipeConfig { // First 0x10 bytes of ConfigStruct: one packed 32-bit word followed by three reserved words.
+ union { // 32-bit word 0
+ BitField<0, 11, u32> downsample_horiz;
+ BitField<11, 5, u32> reserved0;
+ BitField<16, 11, u32> downsample_vert;
+ BitField<27, 5, u32> reserved1;
+ };
+ u32 reserved2;
+ u32 reserved3;
+ u32 reserved4;
+};
+static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!");
+
+struct OutputConfig { // 0x10 bytes: one 64-bit word of output options followed by two 32-bit words holding the target rectangle.
+ union { // 64-bit word: alpha fill, 10-bit background colour channels, flip/transpose flags
+ BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode;
+ BitField<3, 3, u64> alpha_fill_slot;
+ BitField<6, 10, u64> background_a;
+ BitField<16, 10, u64> background_r;
+ BitField<26, 10, u64> background_g;
+ BitField<36, 10, u64> background_b;
+ BitField<46, 2, u64> regamma_mode;
+ BitField<48, 1, u64> output_flip_x;
+ BitField<49, 1, u64> output_flip_y;
+ BitField<50, 1, u64> output_transpose;
+ BitField<51, 1, u64> reserved1;
+ BitField<52, 12, u64> reserved2;
+ };
+ union { // 32-bit word: horizontal extent of the target rectangle, 14 bits each
+ BitField<0, 14, u32> target_rect_left;
+ BitField<14, 2, u32> reserved3;
+ BitField<16, 14, u32> target_rect_right;
+ BitField<30, 2, u32> reserved4;
+ };
+ union { // 32-bit word: vertical extent of the target rectangle, 14 bits each
+ BitField<0, 14, u32> target_rect_top;
+ BitField<14, 2, u32> reserved5;
+ BitField<16, 14, u32> target_rect_bottom;
+ BitField<30, 2, u32> reserved6;
+ };
+};
+static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!");
+
+struct OutputSurfaceConfig { // 0x10 bytes describing the output surface; the writers in vic.cpp add 1 to every width/height field before use.
+ union { // 32-bit word 0: format and memory layout
+ BitField<0, 7, VideoPixelFormat> out_pixel_format;
+ BitField<7, 2, u32> out_chroma_loc_horiz;
+ BitField<9, 2, u32> out_chroma_loc_vert;
+ BitField<11, 4, BLK_KIND> out_block_kind;
+ BitField<15, 4, u32> out_block_height; // in gobs, log2
+ BitField<19, 3, u32> reserved0;
+ BitField<22, 10, u32> reserved1;
+ };
+ union { // 32-bit word 1: surface size
+ BitField<0, 14, u32> out_surface_width; // - 1
+ BitField<14, 14, u32> out_surface_height; // - 1
+ BitField<28, 4, u32> reserved2;
+ };
+ union { // 32-bit word 2: luma plane size
+ BitField<0, 14, u32> out_luma_width; // - 1
+ BitField<14, 14, u32> out_luma_height; // - 1
+ BitField<28, 4, u32> reserved3;
+ };
+ union { // 32-bit word 3: chroma plane size
+ BitField<0, 14, u32> out_chroma_width; // - 1
+ BitField<14, 14, u32> out_chroma_height; // - 1
+ BitField<28, 4, u32> reserved4;
+ };
+};
+static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!");
+
+struct MatrixStruct { // 0x20 bytes: a 4x3 conversion matrix stored as four 64-bit words, each packing three signed 20-bit coefficients.
+ union { // 64-bit word 0: coefficients (0..2, 0) plus the right-shift amount
+ BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix
+ BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix
+ BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix
+ BitField<60, 4, u64> matrix_r_shift;
+ };
+ union { // 64-bit word 1: coefficients (0..2, 1) plus the enable bit
+ BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix
+ BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix
+ BitField<40, 20, s64> matrix_coeff21; // (2,1) of 4x3 conversion matrix
+ BitField<60, 3, u64> reserved0;
+ BitField<63, 1, u64> matrix_enable;
+ };
+ union { // 64-bit word 2: coefficients (0..2, 2)
+ BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix
+ BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix
+ BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix
+ BitField<60, 4, u64> reserved1;
+ };
+ union { // 64-bit word 3: coefficients (0..2, 3)
+ BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix
+ BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix
+ BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix
+ BitField<60, 4, u64> reserved2;
+ };
+};
+static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!");
+
+struct ClearRectStruct { // 0x10 bytes holding two rectangles (rect0, rect1), each as four 14-bit edges spread over two 32-bit words.
+ union { // rect0 horizontal extent
+ BitField<0, 14, u32> clear_rect0_left;
+ BitField<14, 2, u32> reserved0;
+ BitField<16, 14, u32> clear_rect0_right;
+ BitField<30, 2, u32> reserved1;
+ };
+ union { // rect0 vertical extent
+ BitField<0, 14, u32> clear_rect0_top;
+ BitField<14, 2, u32> reserved2;
+ BitField<16, 14, u32> clear_rect0_bottom;
+ BitField<30, 2, u32> reserved3;
+ };
+ union { // rect1 horizontal extent
+ BitField<0, 14, u32> clear_rect1_left;
+ BitField<14, 2, u32> reserved4;
+ BitField<16, 14, u32> clear_rect1_right;
+ BitField<30, 2, u32> reserved5;
+ };
+ union { // rect1 vertical extent
+ BitField<0, 14, u32> clear_rect1_top;
+ BitField<14, 2, u32> reserved6;
+ BitField<16, 14, u32> clear_rect1_bottom;
+ BitField<30, 2, u32> reserved7;
+ };
+};
+static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!");
+
+struct SlotConfig { // 0x40 bytes of per-slot configuration: seven packed 64-bit words followed by two reserved 32-bit words.
+ union { // 64-bit word 0: enable flags, fetch control, frame format and filter lengths
+ BitField<0, 1, u64> slot_enable;
+ BitField<1, 1, u64> denoise;
+ BitField<2, 1, u64> advanced_denoise;
+ BitField<3, 1, u64> cadence_detect;
+ BitField<4, 1, u64> motion_map;
+ BitField<5, 1, u64> motion_map_capture;
+ BitField<6, 1, u64> is_even;
+ BitField<7, 1, u64> chroma_even;
+ // fetch control struct
+ BitField<8, 1, u64> current_field_enable;
+ BitField<9, 1, u64> prev_field_enable;
+ BitField<10, 1, u64> next_field_enable;
+ BitField<11, 1, u64> next_nr_field_enable; // noise reduction
+ BitField<12, 1, u64> current_motion_field_enable;
+ BitField<13, 1, u64> prev_motion_field_enable;
+ BitField<14, 1, u64> prev_prev_motion_field_enable;
+ BitField<15, 1, u64> combined_motion_field_enable;
+
+ BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format;
+ BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap
+ BitField<22, 2, u64> filter_length_x;
+ BitField<24, 12, u64> panoramic;
+ BitField<36, 22, u64> reserved1;
+ BitField<58, 6, u64> detail_filter_clamp;
+ };
+ union { // 64-bit word 1: noise/detail filter strengths and deinterlace mode
+ BitField<0, 10, u64> filter_noise;
+ BitField<10, 10, u64> filter_detail;
+ BitField<20, 10, u64> chroma_noise;
+ BitField<30, 10, u64> chroma_detail;
+ BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode;
+ BitField<44, 3, u64> motion_accumulation_weight;
+ BitField<47, 11, u64> noise_iir;
+ BitField<58, 4, u64> light_level;
+ BitField<62, 2, u64> reserved4;
+ };
+ union { // 64-bit word 2: clamps, alpha, clipping and decompression flags
+ BitField<0, 10, u64> soft_clamp_low;
+ BitField<10, 10, u64> soft_clamp_high;
+ BitField<20, 3, u64> reserved5;
+ BitField<23, 9, u64> reserved6;
+ BitField<32, 10, u64> planar_alpha;
+ BitField<42, 1, u64> constant_alpha;
+ BitField<43, 3, u64> stereo_interleave;
+ BitField<46, 1, u64> clip_enabled;
+ BitField<47, 8, u64> clear_rect_mask;
+ BitField<55, 2, u64> degamma_mode;
+ BitField<57, 1, u64> reserved7;
+ BitField<58, 1, u64> decompress_enable;
+ BitField<59, 5, u64> reserved9;
+ };
+ union { // 64-bit word 3: decompression counters
+ BitField<0, 8, u64> decompress_ctb_count;
+ BitField<8, 32, u64> decompress_zbc_count;
+ BitField<40, 24, u64> reserved12;
+ };
+ union { // 64-bit word 4: horizontal extent of the source rectangle, 30 bits each
+ BitField<0, 30, u64> source_rect_left;
+ BitField<30, 2, u64> reserved14;
+ BitField<32, 30, u64> source_rect_right;
+ BitField<62, 2, u64> reserved15;
+ };
+ union { // 64-bit word 5: vertical extent of the source rectangle, 30 bits each
+ BitField<0, 30, u64> source_rect_top;
+ BitField<30, 2, u64> reserved16;
+ BitField<32, 30, u64> source_rect_bottom;
+ BitField<62, 2, u64> reserved17;
+ };
+ union { // 64-bit word 6: destination rectangle, 14 bits per edge
+ BitField<0, 14, u64> dest_rect_left;
+ BitField<14, 2, u64> reserved18;
+ BitField<16, 14, u64> dest_rect_right;
+ BitField<30, 2, u64> reserved19;
+ BitField<32, 14, u64> dest_rect_top;
+ BitField<46, 2, u64> reserved20;
+ BitField<48, 14, u64> dest_rect_bottom;
+ BitField<62, 2, u64> reserved21;
+ };
+ u32 reserved22;
+ u32 reserved23;
+};
+static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!");
+
+struct SlotSurfaceConfig { // 0x10 bytes describing a slot's input surface; same word layout as OutputSurfaceConfig, with slot_block_kind kept as a raw u32.
+ union { // 32-bit word 0: format and memory layout
+ BitField<0, 7, VideoPixelFormat> slot_pixel_format;
+ BitField<7, 2, u32> slot_chroma_loc_horiz;
+ BitField<9, 2, u32> slot_chroma_loc_vert;
+ BitField<11, 4, u32> slot_block_kind;
+ BitField<15, 4, u32> slot_block_height;
+ BitField<19, 3, u32> slot_cache_width;
+ BitField<22, 10, u32> reserved0;
+ };
+ union { // 32-bit word 1: surface size
+ BitField<0, 14, u32> slot_surface_width; // - 1
+ BitField<14, 14, u32> slot_surface_height; // - 1
+ BitField<28, 4, u32> reserved1;
+ };
+ union { // 32-bit word 2: luma plane size
+ BitField<0, 14, u32> slot_luma_width; // padded, - 1
+ BitField<14, 14, u32> slot_luma_height; // padded, - 1
+ BitField<28, 4, u32> reserved2;
+ };
+ union { // 32-bit word 3: chroma plane size
+ BitField<0, 14, u32> slot_chroma_width; // padded, - 1
+ BitField<14, 14, u32> slot_chroma_height; // padded, - 1
+ BitField<28, 4, u32> reserved3;
+ };
+};
+static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!");
-class Vic {
+struct LumaKeyStruct { // 0x10 bytes: a 4x1 luma conversion vector plus luma-key bounds, packed into two 64-bit words. NOTE(review): the comments say S12.8 (signed) but the fields are declared u64, unlike the s64 coefficients in MatrixStruct - confirm no sign extension is needed.
+ union { // 64-bit word 0: coefficients 0-2 and the right-shift amount
+ BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format
+ BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format
+ BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format
+ BitField<60, 4, u64> luma_r_shift;
+ };
+ union { // 64-bit word 1: coefficient 3, 10-bit key bounds and the enable bit
+ BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format
+ BitField<20, 10, u64> luma_key_lower;
+ BitField<30, 10, u64> luma_key_upper;
+ BitField<40, 1, u64> luma_key_enabled;
+ BitField<41, 2, u64> reserved0;
+ BitField<43, 21, u64> reserved1;
+ };
+};
+static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!");
+
+struct BlendingSlotStruct { // 0x10 bytes of per-slot blending state packed into four 32-bit words.
+ union { // 32-bit word 0: the two 10-bit blend constants
+ BitField<0, 10, u32> alpha_k1;
+ BitField<10, 6, u32> reserved0;
+ BitField<16, 10, u32> alpha_k2;
+ BitField<26, 6, u32> reserved1;
+ };
+ union { // 32-bit word 1: source/destination factor selectors for colour and alpha
+ BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select;
+ BitField<3, 1, u32> reserved2;
+ BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select;
+ BitField<7, 1, u32> reserved3;
+ BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select;
+ BitField<11, 1, u32> reserved4;
+ BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select;
+ BitField<15, 1, u32> reserved5;
+ BitField<16, 4, u32> reserved6;
+ BitField<20, 4, u32> reserved7;
+ BitField<24, 4, u32> reserved8;
+ BitField<28, 4, u32> reserved9;
+ };
+ union { // 32-bit word 2: 10-bit override values for R, G and B
+ BitField<0, 2, u32> reserved10;
+ BitField<2, 10, u32> override_r;
+ BitField<12, 10, u32> override_g;
+ BitField<22, 10, u32> override_b;
+ };
+ union { // 32-bit word 3: 10-bit override value for A, then per-channel override and mask flags
+ BitField<0, 10, u32> override_a;
+ BitField<10, 2, u32> reserved11;
+ BitField<12, 1, u32> use_override_r;
+ BitField<13, 1, u32> use_override_g;
+ BitField<14, 1, u32> use_override_b;
+ BitField<15, 1, u32> use_override_a;
+ BitField<16, 1, u32> mask_r;
+ BitField<17, 1, u32> mask_g;
+ BitField<18, 1, u32> mask_b;
+ BitField<19, 1, u32> mask_a;
+ BitField<20, 12, u32> reserved12;
+ };
+};
+static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!");
+
+struct SlotStruct { // Everything configured for one slot; 0x40 + 0x10 + 0x10 + 0x20 + 0x20 + 0x10 = 0xB0 bytes.
+ SlotConfig config;
+ SlotSurfaceConfig surface_config;
+ LumaKeyStruct luma_key;
+ MatrixStruct color_matrix;
+ MatrixStruct gamut_matrix;
+ BlendingSlotStruct blending;
+};
+static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!");
+
+struct ConfigStruct { // Full 0x610-byte configuration block: global output settings followed by 4 clear-rect entries and 8 slots.
+ PipeConfig pipe_config;
+ OutputConfig output_config;
+ OutputSurfaceConfig output_surface_config; // the struct the output writers in vic.cpp take as their parameter type
+ MatrixStruct out_color_matrix;
+ std::array<ClearRectStruct, 4> clear_rects; // each entry holds two rectangles
+ std::array<SlotStruct, 8> slot_structs;
+};
+static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!");
+static_assert(offsetof(ConfigStruct, output_config) == 0x10,
+ "output_config is in the wrong place!");
+static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20,
+ "output_surface_config is in the wrong place!");
+static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30,
+ "out_color_matrix is in the wrong place!");
+static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!");
+static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!");
+static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!");
+
+struct VicRegisters { // Register file of 0x446 32-bit words (0x1118 bytes), accessible by name or as a flat array.
+ static constexpr std::size_t NUM_REGS = 0x446;
+
+ union { // named view and flat reg_array view of the same storage
+ struct {
+ INSERT_PADDING_WORDS_NOINIT(0xC0);
+ u32 execute; // byte offset 0x300
+ INSERT_PADDING_WORDS_NOINIT(0x3F);
+ std::array<std::array<PlaneOffsets, 8>, 8> surfaces; // byte offset 0x400, 8 x 8 plane-offset triples
+ u32 picture_index;
+ u32 control_params;
+ Offset config_struct_offset;
+ Offset filter_struct_offset;
+ Offset palette_offset;
+ Offset hist_offset;
+ u32 context_id;
+ u32 fce_ucode_size;
+ PlaneOffsets output_surface; // byte offset 0x720; destination addresses used by the writers in vic.cpp
+ Offset fce_ucode_offset;
+ INSERT_PADDING_WORDS_NOINIT(0x4);
+ std::array<u32, 8> slot_context_ids;
+ std::array<Offset, 8> comp_tag_buffer_offsets;
+ std::array<Offset, 8> history_buffer_offset;
+ INSERT_PADDING_WORDS_NOINIT(0x25D);
+ u32 pm_trigger_end; // last register, byte offset 0x1114
+ };
+ std::array<u32, NUM_REGS> reg_array;
+ };
+};
+static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!");
+static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!");
+static_assert(offsetof(VicRegisters, picture_index) == 0x700,
+ "picture_index is in the wrong place!");
+static_assert(offsetof(VicRegisters, control_params) == 0x704,
+ "control_params is in the wrong place!");
+static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708,
+ "config_struct_offset is in the wrong place!");
+static_assert(offsetof(VicRegisters, output_surface) == 0x720,
+ "output_surface is in the wrong place!");
+static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740,
+ "slot_context_ids is in the wrong place!");
+static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780,
+ "history_buffer_offset is in the wrong place!");
+static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114,
+ "pm_trigger_end is in the wrong place!");
+static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!");
+
+class Vic final : public CDmaPusher { // derives from CDmaPusher; ProcessMethod below is declared as an override
public:
enum class Method : u32 {
- Execute = 0xc0,
- SetControlParams = 0x1c1,
- SetConfigStructOffset = 0x1c2,
- SetOutputSurfaceLumaOffset = 0x1c8,
- SetOutputSurfaceChromaOffset = 0x1c9,
- SetOutputSurfaceChromaUnusedOffset = 0x1ca
+ Execute = offsetof(VicRegisters, execute), // byte offset 0x300 (asserted); equals 4 * the removed word index 0xc0
+ SetControlParams = offsetof(VicRegisters, control_params), // byte offset 0x704 == 4 * removed 0x1c1
+ SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset), // byte offset 0x708 == 4 * removed 0x1c2
+ SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma), // output_surface is asserted at 0x720 == 4 * removed 0x1c8
+ SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u),
+ SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v)
};
- explicit Vic(Host1x& host1x, std::shared_ptr<Nvdec> nvdec_processor);
-
+ explicit Vic(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue); // takes an id, a syncpoint value and a FrameQueue reference in place of the removed nvdec_processor pointer
~Vic();
/// Write to the device state.
- void ProcessMethod(Method method, u32 argument);
+ void ProcessMethod(u32 method, u32 arg) override; // method is now a raw u32; NOTE(review): Method values are byte offsets, not word indices - confirm the implementation scales method accordingly
private:
void Execute();
- void WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);
+ void Blend(const ConfigStruct& config, const SlotStruct& slot); // NOTE(review): body not in this header; presumably composites one slot into the output - confirm
- void WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);
+ template <bool Planar, bool Interlaced = false>
+ void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame); // NOTE(review): shared_ptr is taken by value, so passing an lvalue copies it (ref-count update); consider const& if ownership is not extended
+ template <bool Planar, bool TopField>
+ void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame); // same by-value shared_ptr parameter as the progressive variant
- Host1x& host1x;
- std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
+ template <bool Planar>
+ void ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
+ std::shared_ptr<const FFmpeg::Frame> frame); // NOTE(review): presumably dispatches to the progressive/interlaced readers above - confirm in vic.cpp
- /// Avoid reallocation of the following buffers every frame, as their
- /// size does not change during a stream
- using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
- AVMallocPtr converted_frame_buffer;
- Common::ScratchBuffer<u8> luma_buffer;
- Common::ScratchBuffer<u8> chroma_buffer;
+ void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config);
- GPUVAddr config_struct_address{};
- GPUVAddr output_surface_luma_address{};
- GPUVAddr output_surface_chroma_address{};
+ template <VideoPixelFormat Format>
+ void WriteABGR(const OutputSurfaceConfig& output_surface_config); // pixel format is a compile-time template argument
- SwsContext* scaler_ctx{};
- s32 scaler_width{};
- s32 scaler_height{};
-};
+ s32 id; // no default member initializer; must be set by the constructor
+ s32 nvdec_id{-1}; // defaults to -1; NOTE(review): presumably means "no nvdec associated yet" - confirm
+ u32 syncpoint; // no default member initializer; must be set by the constructor
+
+ VicRegisters regs{}; // value-initialized register block
+ FrameQueue& frame_queue; // reference member: the FrameQueue must outlive this Vic
-} // namespace Host1x
+ const bool has_sse41{false}; // const member; any value other than false must come from a constructor initializer
+
+ Common::ScratchBuffer<Pixel> output_surface; // NOTE(review): scratch buffers presumably live as members to avoid per-frame reallocation, like the buffers they replace - confirm
+ Common::ScratchBuffer<Pixel> slot_surface;
+ Common::ScratchBuffer<u8> luma_scratch;
+ Common::ScratchBuffer<u8> chroma_scratch;
+ Common::ScratchBuffer<u8> swizzle_scratch;
+};
-} // namespace Tegra
+} // namespace Tegra::Host1x
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index ac7c1472a..448624aa9 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -43,6 +43,8 @@ public:
u64 big_page_bits_ = 16, u64 page_bits_ = 12);
~MemoryManager();
+ static constexpr bool HAS_FLUSH_INVALIDATION = true; // NOTE(review): consumers are not visible in this hunk - presumably a compile-time trait checked by generic code; confirm
+
size_t GetID() const {
return unique_identifier;
}