From eb67a45ca82bc01ac843c853fd3c17f2a90e0250 Mon Sep 17 00:00:00 2001
From: ameerj <aj662@drexel.edu>
Date: Mon, 26 Oct 2020 23:07:36 -0400
Subject: [PATCH] video_core: NVDEC Implementation

This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library.

The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data.

To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library.

Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header.

Async GPU is not properly implemented at the moment.

Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com>
---
 CMakeLists.txt                                |   14 +
 CMakeModules/CopyYuzuFFmpegDeps.cmake         |   10 +
 externals/find-modules/FindFFmpeg.cmake       |  100 ++
 src/common/CMakeLists.txt                     |    2 +
 src/common/stream.cpp                         |   47 +
 src/common/stream.h                           |   50 +
 src/core/CMakeLists.txt                       |    2 +
 .../service/nvdrv/devices/nvhost_nvdec.cpp    |  100 +-
 .../hle/service/nvdrv/devices/nvhost_nvdec.h  |   71 +-
 .../nvdrv/devices/nvhost_nvdec_common.cpp     |  234 ++++
 .../nvdrv/devices/nvhost_nvdec_common.h       |  168 +++
 .../hle/service/nvdrv/devices/nvhost_vic.cpp  |   90 +-
 .../hle/service/nvdrv/devices/nvhost_vic.h    |   88 +-
 src/core/hle/service/nvdrv/devices/nvmap.h    |    1 +
 src/core/hle/service/nvdrv/nvdrv.cpp          |    4 +-
 src/core/settings.cpp                         |    2 +
 src/core/settings.h                           |    1 +
 src/core/telemetry_session.cpp                |    2 +
 src/video_core/CMakeLists.txt                 |   26 +
 src/video_core/cdma_pusher.cpp                |  171 +++
 src/video_core/cdma_pusher.h                  |  138 +++
 .../command_classes/codecs/codec.cpp          |  114 ++
 src/video_core/command_classes/codecs/codec.h |   68 ++
 .../command_classes/codecs/h264.cpp           |  276 +++++
 src/video_core/command_classes/codecs/h264.h  |  130 +++
 src/video_core/command_classes/codecs/vp9.cpp | 1010 +++++++++++++++++
 src/video_core/command_classes/codecs/vp9.h   |  216 ++++
 .../command_classes/codecs/vp9_types.h        |  369 ++++++
 src/video_core/command_classes/host1x.cpp     |   39 +
 src/video_core/command_classes/host1x.h       |   78 ++
 src/video_core/command_classes/nvdec.cpp      |   56 +
 src/video_core/command_classes/nvdec.h        |   39 +
 src/video_core/command_classes/nvdec_common.h |   48 +
 .../command_classes/sync_manager.cpp          |   60 +
 src/video_core/command_classes/sync_manager.h |   64 ++
 src/video_core/command_classes/vic.cpp        |  180 +++
 src/video_core/command_classes/vic.h          |  110 ++
 src/video_core/gpu.cpp                        |   11 +-
 src/video_core/gpu.h                          |   23 +-
 src/video_core/gpu_asynch.cpp                 |   26 +-
 src/video_core/gpu_asynch.h                   |    3 +-
 src/video_core/gpu_synch.cpp                  |   18 +-
 src/video_core/gpu_synch.h                    |    3 +-
 src/video_core/gpu_thread.cpp                 |   16 +-
 src/video_core/gpu_thread.h                   |   19 +-
 src/video_core/memory_manager.cpp             |   12 +-
 src/video_core/memory_manager.h               |    5 +-
 src/video_core/video_core.cpp                 |    5 +-
 src/yuzu/CMakeLists.txt                       |    2 +
 src/yuzu/configuration/config.cpp             |    4 +
 src/yuzu/configuration/configure_graphics.cpp |   10 +
 src/yuzu/configuration/configure_graphics.h   |    1 +
 src/yuzu/configuration/configure_graphics.ui  |    7 +
 53 files changed, 4033 insertions(+), 310 deletions(-)
 create mode 100644 CMakeModules/CopyYuzuFFmpegDeps.cmake
 create mode 100644 externals/find-modules/FindFFmpeg.cmake
 create mode 100644 src/common/stream.cpp
 create mode 100644 src/common/stream.h
 create mode 100644 src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
 create mode 100644 src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
 create mode 100644 src/video_core/cdma_pusher.cpp
 create mode 100644 src/video_core/cdma_pusher.h
 create mode 100644 src/video_core/command_classes/codecs/codec.cpp
 create mode 100644 src/video_core/command_classes/codecs/codec.h
 create mode 100644 src/video_core/command_classes/codecs/h264.cpp
 create mode 100644 src/video_core/command_classes/codecs/h264.h
 create mode 100644 src/video_core/command_classes/codecs/vp9.cpp
 create mode 100644 src/video_core/command_classes/codecs/vp9.h
 create mode 100644 src/video_core/command_classes/codecs/vp9_types.h
 create mode 100644 src/video_core/command_classes/host1x.cpp
 create mode 100644 src/video_core/command_classes/host1x.h
 create mode 100644 src/video_core/command_classes/nvdec.cpp
 create mode 100644 src/video_core/command_classes/nvdec.h
 create mode 100644 src/video_core/command_classes/nvdec_common.h
 create mode 100644 src/video_core/command_classes/sync_manager.cpp
 create mode 100644 src/video_core/command_classes/sync_manager.h
 create mode 100644 src/video_core/command_classes/vic.cpp
 create mode 100644 src/video_core/command_classes/vic.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45bd03a65..8e9502a97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -263,6 +263,7 @@ if (CONAN_REQUIRED_LIBS)
         libzip:with_openssl=False
         libzip:enable_windows_crypto=False
     )
+
     conan_check(VERSION 1.24.0 REQUIRED)
     # Add the bincrafters remote
     conan_add_remote(NAME bincrafters
@@ -354,6 +355,19 @@ if (NOT LIBUSB_FOUND)
     set(LIBUSB_LIBRARIES usb)
 endif()
 
+# Use system installed FFmpeg, except on MSVC where a bundled copy is downloaded.
+if (NOT MSVC)
+    find_package(FFmpeg REQUIRED)
+else()
+    set(FFMPEG_EXT_NAME "ffmpeg-4.2.1")
+    set(FFMPEG_PATH "${CMAKE_BINARY_DIR}/externals/${FFMPEG_EXT_NAME}")
+    download_bundled_external("ffmpeg/" ${FFMPEG_EXT_NAME} "")
+    set(FFMPEG_FOUND YES)
+    set(FFMPEG_INCLUDE_DIR "${FFMPEG_PATH}/include" CACHE PATH "Path to FFmpeg headers" FORCE)
+    set(FFMPEG_LIBRARY_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg library" FORCE)
+    set(FFMPEG_DLL_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg dll's" FORCE)
+endif()
+
 # Prefer the -pthread flag on Linux.
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/CMakeModules/CopyYuzuFFmpegDeps.cmake b/CMakeModules/CopyYuzuFFmpegDeps.cmake
new file mode 100644
index 000000000..cca1eeeab
--- /dev/null
+++ b/CMakeModules/CopyYuzuFFmpegDeps.cmake
@@ -0,0 +1,10 @@
+function(copy_yuzu_FFmpeg_deps target_dir)
+    include(WindowsCopyFiles)
+    set(DLL_DEST "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/")
+    windows_copy_files(${target_dir} ${FFMPEG_DLL_DIR} ${DLL_DEST}
+        avcodec-58.dll
+        avutil-56.dll
+        swresample-3.dll
+        swscale-5.dll
+    )
+endfunction(copy_yuzu_FFmpeg_deps)
diff --git a/externals/find-modules/FindFFmpeg.cmake b/externals/find-modules/FindFFmpeg.cmake
new file mode 100644
index 000000000..77b331e00
--- /dev/null
+++ b/externals/find-modules/FindFFmpeg.cmake
@@ -0,0 +1,100 @@
+# - Try to find ffmpeg libraries (libavcodec, libavutil and libswscale)
+# Once done this will define
+#
+# FFMPEG_FOUND - system has ffmpeg or libav
+# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
+# FFMPEG_LIBRARIES - Link these to use ffmpeg
+# FFMPEG_LIBAVCODEC
+# FFMPEG_LIBAVUTIL
+# FFMPEG_LIBSWSCALE
+#
+# Copyright (c) 2008 Andreas Schneider <mail@cynapses.org>
+# Modified for other libraries by Lasse Kärkkäinen <tronic>
+# Modified for Hedgewars by Stepik777
+# Modified for FFmpeg-example Tuukka Pasanen 2018
+# Modified for yuzu toastUnlimited 2020
+#
+# Redistribution and use is allowed according to the terms of the New
+# BSD license.
+#
+
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(FFMPEG
+  FOUND_VAR FFMPEG_FOUND
+  REQUIRED_VARS
+      FFMPEG_LIBRARY
+      FFMPEG_INCLUDE_DIR
+  VERSION_VAR FFMPEG_VERSION
+)
+
+if(FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
+  # in cache already
+  set(FFMPEG_FOUND TRUE)
+else()
+  # use pkg-config to get the directories and then use these values
+  # in the FIND_PATH() and FIND_LIBRARY() calls
+  find_package(PkgConfig)
+  if(PKG_CONFIG_FOUND)
+    pkg_check_modules(_FFMPEG_AVCODEC libavcodec)
+    pkg_check_modules(_FFMPEG_AVUTIL libavutil)
+    pkg_check_modules(_FFMPEG_SWSCALE libswscale)
+  endif()
+
+  find_path(FFMPEG_AVCODEC_INCLUDE_DIR
+    NAMES libavcodec/avcodec.h
+    PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS}
+      /usr/include
+      /usr/local/include
+      /opt/local/include
+      /sw/include
+    PATH_SUFFIXES ffmpeg libav)
+
+  find_library(FFMPEG_LIBAVCODEC
+    NAMES avcodec
+    PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  find_library(FFMPEG_LIBAVUTIL
+    NAMES avutil
+    PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  find_library(FFMPEG_LIBSWSCALE
+    NAMES swscale
+    PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  if(FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVUTIL AND FFMPEG_LIBSWSCALE)
+    set(FFMPEG_FOUND TRUE)
+  endif()
+
+  if(FFMPEG_FOUND)
+    set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
+    set(FFMPEG_LIBRARIES
+      ${FFMPEG_LIBAVCODEC}
+      ${FFMPEG_LIBAVUTIL}
+      ${FFMPEG_LIBSWSCALE})
+  endif()
+
+  if(FFMPEG_FOUND)
+    if(NOT FFmpeg_FIND_QUIETLY)  # find_package() sets <PackageName>_FIND_*, case-sensitively
+      message(STATUS
+      "Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
+    endif()
+  else()
+    if(FFmpeg_FIND_REQUIRED)  # honour find_package(FFmpeg REQUIRED)
+      message(FATAL_ERROR
+      "Could not find libavcodec or libavutil or libswscale")
+    endif()
+  endif()
+endif()
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 0fb5d9708..e50ab2922 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -150,6 +150,8 @@ add_library(common STATIC
     scope_exit.h
     spin_lock.cpp
     spin_lock.h
+    stream.cpp
+    stream.h
     string_util.cpp
     string_util.h
     swap.h
diff --git a/src/common/stream.cpp b/src/common/stream.cpp
new file mode 100644
index 000000000..bf0496c26
--- /dev/null
+++ b/src/common/stream.cpp
@@ -0,0 +1,47 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <stdexcept>
+#include "common/common_types.h"
+#include "common/stream.h"
+
+namespace Common {
+
+Stream::Stream() = default;
+Stream::~Stream() = default;
+
+void Stream::Seek(s32 offset, SeekOrigin origin) {
+    if (origin == SeekOrigin::SetOrigin) {
+        if (offset < 0) {
+            position = 0; // Negative targets clamp to the start of the buffer.
+        } else if (static_cast<std::size_t>(offset) >= buffer.size()) {
+            position = buffer.size(); // Targets past the data clamp to the end of the buffer.
+        } else {
+            position = static_cast<std::size_t>(offset);
+        }
+    } else if (origin == SeekOrigin::FromCurrentPos) {
+        Seek(static_cast<s32>(position) + offset, SeekOrigin::SetOrigin); // Relative to cursor.
+    } else if (origin == SeekOrigin::FromEnd) {
+        Seek(static_cast<s32>(buffer.size()) - offset, SeekOrigin::SetOrigin); // Back from end.
+    }
+}
+
+u8 Stream::ReadByte() {
+    // Nothing left to read once the cursor has reached the end of the buffer.
+    if (position >= buffer.size()) {
+        throw std::out_of_range("Attempting to read a byte not within the buffer range");
+    }
+    return buffer[position++];
+}
+
+void Stream::WriteByte(u8 byte) {
+    if (position != buffer.size()) {
+        buffer.insert(buffer.begin() + position, byte); // Mid-buffer: cursor is not advanced.
+        return;
+    }
+    buffer.push_back(byte); // At the end: append, then step the cursor past the new byte.
+    position++;
+}
+
+} // namespace Common
diff --git a/src/common/stream.h b/src/common/stream.h
new file mode 100644
index 000000000..2585c16af
--- /dev/null
+++ b/src/common/stream.h
@@ -0,0 +1,50 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_types.h"
+
+namespace Common {
+
+enum class SeekOrigin {
+    SetOrigin,
+    FromCurrentPos,
+    FromEnd,
+};
+
+class Stream {
+public:
+    /// Growable in-memory byte buffer with a read/write cursor ("position").
+    explicit Stream();
+    ~Stream();
+
+    /// Moves the cursor relative to origin; for FromEnd the offset is subtracted from the size
+    void Seek(s32 offset, SeekOrigin origin);
+
+    /// Returns the byte at the cursor and advances it; throws std::out_of_range at the end
+    u8 ReadByte();
+
+    /// Appends at the end (advancing the cursor) or inserts at the cursor without advancing it
+    void WriteByte(u8 byte);
+
+    std::size_t GetPosition() const {
+        return position;
+    }
+
+    std::vector<u8>& GetBuffer() {
+        return buffer;
+    }
+
+    const std::vector<u8>& GetBuffer() const {
+        return buffer;
+    }
+
+private:
+    std::vector<u8> buffer;
+    std::size_t position{0};
+};
+
+} // namespace Common
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index db1c9fdef..e0f207f3e 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -439,6 +439,8 @@ add_library(core STATIC
     hle/service/nvdrv/devices/nvhost_gpu.h
     hle/service/nvdrv/devices/nvhost_nvdec.cpp
     hle/service/nvdrv/devices/nvhost_nvdec.h
+    hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+    hle/service/nvdrv/devices/nvhost_nvdec_common.h
     hle/service/nvdrv/devices/nvhost_nvjpg.cpp
     hle/service/nvdrv/devices/nvhost_nvjpg.h
     hle/service/nvdrv/devices/nvhost_vic.cpp
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index fcb612864..b6df48360 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <cstring>
-
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
 
 namespace Service::Nvidia::Devices {
 
-nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {}
+nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 nvhost_nvdec::~nvhost_nvdec() = default;
 
 u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@@ -21,7 +23,7 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
 
     switch (static_cast<IoctlCommand>(command.raw)) {
     case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
     case IoctlCommand::IocSubmit:
         return Submit(input, output);
     case IoctlCommand::IocGetSyncpoint:
@@ -29,79 +31,29 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
     case IoctlCommand::IocGetWaitbase:
         return GetWaitbase(input, output);
     case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
+    case IoctlCommand::IocMapBuffer3:
     case IoctlCommand::IocMapBufferEx:
-        return MapBufferEx(input, output);
-    case IoctlCommand::IocUnmapBufferEx:
-        return UnmapBufferEx(input, output);
+        return MapBuffer(input, output);
+    case IoctlCommand::IocUnmapBufferEx: {
+        // This command is sent when the video stream has ended, flush all video contexts
+        // This is usually sent in the following order: vic, nvdec, vic.
+        // Inform the GPU to clear any remaining nvdec buffers when this is detected.
+        LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
+        Tegra::ChCommandHeaderList cmdlist(1);
+        cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
+        system.GPU().PushCommandBuffer(cmdlist);
+        [[fallthrough]]; // fallthrough to unmap buffers
+    };
+    case IoctlCommand::IocUnmapBuffer:
+    case IoctlCommand::IocUnmapBuffer2:
+    case IoctlCommand::IocUnmapBuffer3:
+        return UnmapBuffer(input, output);
+    case IoctlCommand::IocSetSubmitTimeout:
+        return SetSubmitTimeout(input, output);
     }
 
-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
-    return 0;
-}
-
-u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
-    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
-
-    nvmap_fd = params.nvmap_fd;
-    return 0;
-}
-
-u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSubmit params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
-    return 0;
-}
-
-u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetSyncpoint params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
-    return 0;
-}
-
-u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetWaitbase params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
-    return 0;
-}
-
-u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBuffer params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
-    return 0;
-}
-
-u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
-    return 0;
-}
-
-u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlUnmapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
     return 0;
 }
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
index 4332db118..102777ddd 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@@ -4,16 +4,14 @@
 
 #pragma once
 
-#include <vector>
-#include "common/common_types.h"
-#include "common/swap.h"
-#include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include <memory>
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
 
 namespace Service::Nvidia::Devices {
 
-class nvhost_nvdec final : public nvdevice {
+class nvhost_nvdec final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_nvdec(Core::System& system);
+    explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
     ~nvhost_nvdec() override;
 
     u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@@ -27,62 +25,15 @@ private:
         IocGetSyncpoint = 0xC0080002,
         IocGetWaitbase = 0xC0080003,
         IocMapBuffer = 0xC01C0009,
+        IocMapBuffer2 = 0xC16C0009,
+        IocMapBuffer3 = 0xC15C0009,
         IocMapBufferEx = 0xC0A40009,
-        IocUnmapBufferEx = 0xC0A4000A,
+        IocUnmapBuffer = 0xC0A4000A,
+        IocUnmapBuffer2 = 0xC16C000A,
+        IocUnmapBufferEx = 0xC01C000A,
+        IocUnmapBuffer3 = 0xC15C000A,
+        IocSetSubmitTimeout = 0x40040007,
     };
-
-    struct IoctlSetNvmapFD {
-        u32_le nvmap_fd;
-    };
-    static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size");
-
-    struct IoctlSubmit {
-        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size");
-
-    struct IoctlGetSyncpoint {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size");
-
-    struct IoctlGetWaitbase {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size");
-
-    struct IoctlMapBuffer {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
-
-    struct IoctlMapBufferEx {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size");
-
-    struct IoctlUnmapBufferEx {
-        INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size");
-
-    u32_le nvmap_fd{};
-
-    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };
 
 } // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
new file mode 100644
index 000000000..85792495f
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -0,0 +1,234 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
+#include "core/hle/service/nvdrv/devices/nvmap.h"
+#include "core/memory.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+
+namespace Service::Nvidia::Devices {
+
+namespace {
+// Copies count T elements from input (at byte offset) into dst; returns the offset past them.
+template <typename T>
+std::size_t SpliceVectors(const std::vector<u8>& input, std::vector<T>& dst, std::size_t count,
+                          std::size_t offset) {
+    const std::size_t bytes = count * sizeof(T);
+    std::memcpy(dst.data(), input.data() + offset, bytes);
+    return offset + bytes;
+}
+
+// Copies the raw bytes of src into dst at byte offset (dst is not resized); returns the new offset
+template <typename T>
+std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
+    const std::size_t bytes = src.size() * sizeof(T);
+    std::memcpy(dst.data() + offset, src.data(), bytes);
+    return offset + bytes;
+}
+} // Anonymous namespace
+
+namespace NvErrCodes {
+constexpr u32 Success{};
+constexpr u32 OutOfMemory{static_cast<u32>(-12)};  // presumably -ENOMEM; TODO confirm
+constexpr u32 InvalidInput{static_cast<u32>(-22)}; // presumably -EINVAL; TODO confirm
+} // namespace NvErrCodes
+
+nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
+nvhost_nvdec_common::~nvhost_nvdec_common() = default;
+
+u32 nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
+    // Decode the fixed-size request and remember the nvmap fd it carries.
+    IoctlSetNvmapFD request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    nvmap_fd = request.nvmap_fd;
+    LOG_DEBUG(Service_NVDRV, "called, fd={}", request.nvmap_fd);
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlSubmit params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
+    LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
+    // Overview: decode the variable-length arrays that follow the header, push every command
+    // buffer to the GPU, then echo the header and most of the arrays back through output.
+    std::size_t offset = sizeof(IoctlSubmit);
+    std::vector<CommandBuffer> command_buffers(params.cmd_buffer_count);
+    std::vector<Reloc> relocs(params.relocation_count);
+    std::vector<u32> reloc_shifts(params.relocation_count);
+    std::vector<SyncptIncr> syncpt_increments(params.syncpoint_count);
+    std::vector<SyncptIncr> wait_checks(params.syncpoint_count);
+    std::vector<Fence> fences(params.fence_count);
+
+    // Copy each array out of the input, in the order they are read after the header
+    offset = SpliceVectors(input, command_buffers, params.cmd_buffer_count, offset);
+    offset = SpliceVectors(input, relocs, params.relocation_count, offset);
+    offset = SpliceVectors(input, reloc_shifts, params.relocation_count, offset);
+    offset = SpliceVectors(input, syncpt_increments, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, wait_checks, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, fences, params.fence_count, offset);
+
+    // TODO(ameerj): For async gpu, utilize fences for syncpoint 'max' increment
+
+    auto& gpu = system.GPU();
+
+    for (const auto& cmd_buffer : command_buffers) {
+        auto object = nvmap_dev->GetObject(cmd_buffer.memory_id);
+        ASSERT_OR_EXECUTE(object, return NvErrCodes::InvalidInput;);
+        const auto map = FindBufferMap(object->dma_map_addr);
+        if (!map) {
+            LOG_ERROR(Service_NVDRV, "Tried to submit an invalid offset 0x{:X} dma 0x{:X}",
+                      object->addr, object->dma_map_addr);
+            return 0; // NOTE(review): returns 0 (success) after logging an error - confirm
+        }
+        Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
+        gpu.MemoryManager().ReadBlock(map->StartAddr() + cmd_buffer.offset, cmdlist.data(),
+                                      cmdlist.size() * sizeof(u32));
+        gpu.PushCommandBuffer(cmdlist);
+    }
+
+    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
+    // Some games expect command_buffers to be written back (fences are not written back here)
+    offset = sizeof(IoctlSubmit);
+    offset = WriteVectors(output, command_buffers, offset);
+    offset = WriteVectors(output, relocs, offset);
+    offset = WriteVectors(output, reloc_shifts, offset);
+    offset = WriteVectors(output, syncpt_increments, offset);
+    offset = WriteVectors(output, wait_checks, offset);
+
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetSyncpoint request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", request.param);
+
+    // A real syncpoint value is deliberately not reported: implementing this caused deadlocks
+    // and degraded performance with async gpu. TODO: RE the nvdec async implementation
+    request.value = 0;
+    std::memcpy(output.data(), &request, sizeof(request));
+
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetWaitbase reply{};
+    std::memcpy(&reply, input.data(), sizeof(reply));
+    reply.value = 0; // The reported waitbase is always zero
+    std::memcpy(output.data(), &reply, sizeof(reply));
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+    // The entries to map follow the fixed-size header in the input buffer.
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+    auto& gpu = system.GPU();
+
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // Echo only the header: copying output.size() bytes could read past params.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(params)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (object->dma_map_addr == 0) {
+            // NVDEC and VIC memory is in the 32-bit address space
+            // MapAllocate32 will attempt to map a lower 32-bit value in the shared gpu memory space
+            const GPUVAddr low_addr = gpu.MemoryManager().MapAllocate32(object->addr, object->size);
+            object->dma_map_addr = static_cast<u32>(low_addr);
+            // Ensure that the dma_map_addr is indeed in the lower 32-bit address space.
+            ASSERT(object->dma_map_addr == low_addr);
+        }
+        if (!object->dma_map_addr) {
+            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
+        } else {
+            cmf_buff.map_address = object->dma_map_addr;
+            AddBufferMap(object->dma_map_addr, object->size, object->addr,
+                         object->status == nvmap::Object::Status::Allocated);
+        }
+    }
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
+    std::memcpy(output.data() + sizeof(IoctlMapBuffer), cmd_buffer_handles.data(),
+                cmd_buffer_handles.size() * sizeof(MapBufferEntry));
+
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        const auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // Echo only the header: copying output.size() bytes could read past params.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(params)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
+            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+        } else {
+            // This occurs quite frequently, however does not seem to impact functionality
+            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
+                      object->dma_map_addr);
+        }
+        object->dma_map_addr = 0;
+    }
+    std::memset(output.data(), 0, output.size());
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output) {
+    std::memcpy(&submit_timeout, input.data(), std::min(input.size(), sizeof(submit_timeout)));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called"); // copy above is capped at the member's size
+    return NvErrCodes::Success;
+}
+
+std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
+    GPUVAddr gpu_addr) const {
+    // Only mappings keyed at or below gpu_addr can contain it; a miss yields `last`, not end().
+    const auto last = buffer_mappings.upper_bound(gpu_addr);
+    const auto it = std::find_if(buffer_mappings.begin(), last, [&](const auto& entry) {
+        return gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr();
+    });
+    // Report a miss as nullopt so callers' `if (!map)` checks work; never dereference `last`.
+    return it != last ? std::optional<BufferMap>{it->second} : std::nullopt;
+}
+
+void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
+                                       bool is_allocated) {
+    buffer_mappings.insert_or_assign(gpu_addr, BufferMap{gpu_addr, size, cpu_addr, is_allocated});
+}
+
+std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
+    // Mappings are keyed by their start address, so only an exact match can be removed.
+    const auto entry = buffer_mappings.find(gpu_addr);
+    if (entry == buffer_mappings.end()) {
+        return std::nullopt;
+    }
+
+    // Entries that are not flagged as allocated are reported with a size of zero.
+    const std::size_t size = entry->second.IsAllocated() ? entry->second.Size() : 0;
+    buffer_mappings.erase(entry);
+    return size;
+}
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
new file mode 100644
index 000000000..c249c5349
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -0,0 +1,168 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/nvdrv/devices/nvdevice.h"
+
+namespace Service::Nvidia::Devices {
+class nvmap;
+
+class nvhost_nvdec_common : public nvdevice {
+public:
+    explicit nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
+    ~nvhost_nvdec_common() override;
+
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
+              std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
+              IoctlVersion version) override = 0;
+
+protected:
+    class BufferMap final {
+    public:
+        constexpr BufferMap() = default;
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size)
+            : start_addr{start_addr}, end_addr{start_addr + size} {}
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size, VAddr cpu_addr,
+                            bool is_allocated)
+            : start_addr{start_addr}, end_addr{start_addr + size}, cpu_addr{cpu_addr},
+              is_allocated{is_allocated} {}
+
+        constexpr GPUVAddr StartAddr() const {
+            return start_addr;
+        }
+
+        constexpr GPUVAddr EndAddr() const {
+            return end_addr;
+        }
+
+        constexpr std::size_t Size() const {
+            return end_addr - start_addr;
+        }
+
+        constexpr VAddr CpuAddr() const {
+            return cpu_addr;
+        }
+
+        constexpr bool IsAllocated() const {
+            return is_allocated;
+        }
+
+    private:
+        GPUVAddr start_addr{};
+        GPUVAddr end_addr{};
+        VAddr cpu_addr{};
+        bool is_allocated{};
+    };
+
+    struct IoctlSetNvmapFD {
+        u32_le nvmap_fd;
+    };
+    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+
+    struct IoctlSubmitCommandBuffer {
+        u32_le id;
+        u32_le offset;
+        u32_le count;
+    };
+    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
+                  "IoctlSubmitCommandBuffer is incorrect size");
+    struct IoctlSubmit {
+        u32_le cmd_buffer_count;
+        u32_le relocation_count;
+        u32_le syncpoint_count;
+        u32_le fence_count;
+    };
+    static_assert(sizeof(IoctlSubmit) == 0x10, "IoctlSubmit has incorrect size");
+
+    struct CommandBuffer {
+        s32 memory_id;
+        u32 offset;
+        s32 word_count;
+    };
+    static_assert(sizeof(CommandBuffer) == 0xC, "CommandBuffer has incorrect size");
+
+    struct Reloc {
+        s32 cmdbuffer_memory;
+        s32 cmdbuffer_offset;
+        s32 target;
+        s32 target_offset;
+    };
+    static_assert(sizeof(Reloc) == 0x10, "Reloc has incorrect size");
+
+    struct SyncptIncr {
+        u32 id;
+        u32 increments;
+    };
+    static_assert(sizeof(SyncptIncr) == 0x8, "SyncptIncr has incorrect size");
+
+    struct Fence {
+        u32 id;
+        u32 value;
+    };
+    static_assert(sizeof(Fence) == 0x8, "Fence has incorrect size");
+
+    struct IoctlGetSyncpoint {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetSyncpoint) == 8, "IoctlGetSyncpoint has wrong size");
+
+    struct IoctlGetWaitbase {
+        u32_le unknown; // seems to be ignored? Nintendo added this
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
+
+    struct IoctlMapBuffer {
+        u32_le num_entries;
+        u32_le data_address; // Ignored by the driver.
+        u32_le attach_host_ch_das;
+    };
+    static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
+
+    struct IocGetIdParams {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
+
+    // Used for mapping and unmapping command buffers
+    struct MapBufferEntry {
+        u32_le map_handle;
+        u32_le map_address;
+    };
+    static_assert(sizeof(MapBufferEntry) == 0x8, "MapBufferEntry is incorrect size");
+
+    /// Ioctl command implementations
+    u32 SetNVMAPfd(const std::vector<u8>& input);
+    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
+
+    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
+    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
+    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
+
+    u32_le nvmap_fd{};
+    u32_le submit_timeout{};
+    std::shared_ptr<nvmap> nvmap_dev;
+
+    // Keyed by GPU start address; must stay ordered (FindBufferMap relies on upper_bound)
+    std::map<GPUVAddr, BufferMap> buffer_mappings;
+};
+}; // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index 9da19ad56..60db54d00 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <cstring>
-
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_vic.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
 
 namespace Service::Nvidia::Devices {
+nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 
-nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
 nvhost_vic::~nvhost_vic() = default;
 
 u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@@ -21,7 +23,7 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
 
     switch (static_cast<IoctlCommand>(command.raw)) {
     case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
     case IoctlCommand::IocSubmit:
         return Submit(input, output);
     case IoctlCommand::IocGetSyncpoint:
@@ -29,83 +31,19 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
     case IoctlCommand::IocGetWaitbase:
         return GetWaitbase(input, output);
     case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
+    case IoctlCommand::IocMapBuffer3:
+    case IoctlCommand::IocMapBuffer4:
     case IoctlCommand::IocMapBufferEx:
         return MapBuffer(input, output);
+    case IoctlCommand::IocUnmapBuffer:
+    case IoctlCommand::IocUnmapBuffer2:
+    case IoctlCommand::IocUnmapBuffer3:
     case IoctlCommand::IocUnmapBufferEx:
-        return UnmapBufferEx(input, output);
+        return UnmapBuffer(input, output);
     }
 
-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
-    return 0;
-}
-
-u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
-    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
-
-    nvmap_fd = params.nvmap_fd;
-    return 0;
-}
-
-u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSubmit params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-
-    // Workaround for Luigi's Mansion 3, as nvhost_vic is not implemented for asynch GPU
-    params.command_buffer = {};
-
-    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
-    return 0;
-}
-
-u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetSyncpoint params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
-    return 0;
-}
-
-u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetWaitbase params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
-    return 0;
-}
-
-u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBuffer params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
-    return 0;
-}
-
-u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
-    return 0;
-}
-
-u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlUnmapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
     return 0;
 }
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
index a7bb7bbd5..f975b190c 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -4,19 +4,15 @@
 
 #pragma once
 
-#include <array>
-#include <vector>
-#include "common/common_types.h"
-#include "common/swap.h"
-#include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
 
 namespace Service::Nvidia::Devices {
+class nvmap;
 
-class nvhost_vic final : public nvdevice {
+class nvhost_vic final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_vic(Core::System& system);
-    ~nvhost_vic() override;
-
+    explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
+    ~nvhost_vic() override;
     u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
               std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
               IoctlVersion version) override;
@@ -28,74 +24,14 @@ private:
         IocGetSyncpoint = 0xC0080002,
         IocGetWaitbase = 0xC0080003,
         IocMapBuffer = 0xC01C0009,
+        IocMapBuffer2 = 0xC0340009,
+        IocMapBuffer3 = 0xC0140009,
+        IocMapBuffer4 = 0xC00C0009,
         IocMapBufferEx = 0xC03C0009,
-        IocUnmapBufferEx = 0xC03C000A,
+        IocUnmapBuffer = 0xC03C000A,
+        IocUnmapBuffer2 = 0xC034000A,
+        IocUnmapBuffer3 = 0xC00C000A,
+        IocUnmapBufferEx = 0xC01C000A,
     };
-
-    struct IoctlSetNvmapFD {
-        u32_le nvmap_fd;
-    };
-    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
-
-    struct IoctlSubmitCommandBuffer {
-        u32 id;
-        u32 offset;
-        u32 count;
-    };
-    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
-                  "IoctlSubmitCommandBuffer is incorrect size");
-
-    struct IoctlSubmit {
-        u32 command_buffer_count;
-        u32 relocations_count;
-        u32 syncpt_count;
-        u32 wait_count;
-        std::array<IoctlSubmitCommandBuffer, 4> command_buffer;
-    };
-    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size");
-
-    struct IoctlGetSyncpoint {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size");
-
-    struct IoctlGetWaitbase {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
-
-    struct IoctlMapBuffer {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
-
-    struct IoctlMapBufferEx {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size");
-
-    struct IoctlUnmapBufferEx {
-        INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size");
-
-    u32_le nvmap_fd{};
-
-    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };
-
 } // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h
index 84624be00..04b9ef540 100644
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@@ -37,6 +37,7 @@ public:
         VAddr addr;
         Status status;
         u32 refcount;
+        u32 dma_map_addr;
     };
 
     std::shared_ptr<Object> GetObject(u32 handle) const {
diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp
index 197c77db0..803c1a984 100644
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -51,9 +51,9 @@ Module::Module(Core::System& system) {
     devices["/dev/nvmap"] = nvmap_dev;
     devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
     devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
-    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system);
+    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);
     devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
-    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system);
+    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);
 }
 
 Module::~Module() = default;
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index 28d3f9099..e14c02045 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -63,6 +63,7 @@ void LogSettings() {
     log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue());
     log_setting("Renderer_UseAsynchronousGpuEmulation",
                 values.use_asynchronous_gpu_emulation.GetValue());
+    log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());
     log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
     log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());
     log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
@@ -119,6 +120,7 @@ void RestoreGlobalState() {
     values.use_disk_shader_cache.SetGlobal(true);
     values.gpu_accuracy.SetGlobal(true);
     values.use_asynchronous_gpu_emulation.SetGlobal(true);
+    values.use_nvdec_emulation.SetGlobal(true);
     values.use_vsync.SetGlobal(true);
     values.use_assembly_shaders.SetGlobal(true);
     values.use_asynchronous_shaders.SetGlobal(true);
diff --git a/src/core/settings.h b/src/core/settings.h
index 9834f44bb..604805615 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -111,6 +111,7 @@ struct Values {
     Setting<bool> use_disk_shader_cache;
     Setting<GPUAccuracy> gpu_accuracy;
     Setting<bool> use_asynchronous_gpu_emulation;
+    Setting<bool> use_nvdec_emulation;
     Setting<bool> use_vsync;
     Setting<bool> use_assembly_shaders;
     Setting<bool> use_asynchronous_shaders;
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index da09c0dbc..ebc19e18a 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -206,6 +206,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
              TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));
     AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
              Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    AddField(field_type, "Renderer_UseNvdecEmulation",
+             Settings::values.use_nvdec_emulation.GetValue());
     AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
     AddField(field_type, "Renderer_UseAssemblyShaders",
              Settings::values.use_assembly_shaders.GetValue());
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 77ebac19f..fdfc885fc 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -5,6 +5,24 @@ add_library(video_core STATIC
     buffer_cache/buffer_cache.h
     buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
+    cdma_pusher.cpp
+    cdma_pusher.h
+    command_classes/codecs/codec.cpp
+    command_classes/codecs/codec.h
+    command_classes/codecs/h264.cpp
+    command_classes/codecs/h264.h
+    command_classes/codecs/vp9.cpp
+    command_classes/codecs/vp9.h
+    command_classes/codecs/vp9_types.h
+    command_classes/host1x.cpp
+    command_classes/host1x.h
+    command_classes/nvdec.cpp
+    command_classes/nvdec.h
+    command_classes/nvdec_common.h
+    command_classes/sync_manager.cpp
+    command_classes/sync_manager.h
+    command_classes/vic.cpp
+    command_classes/vic.h
     compatible_formats.cpp
     compatible_formats.h
     dirty_flags.cpp
@@ -250,6 +268,14 @@ create_target_directory_groups(video_core)
 target_link_libraries(video_core PUBLIC common core)
 target_link_libraries(video_core PRIVATE glad xbyak)
 
+if (MSVC)
+    target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
+    target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
+else()
+    target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
+    target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
+endif()
+
 add_dependencies(video_core host_shaders)
 target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
 
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp
new file mode 100644
index 000000000..d774db107
--- /dev/null
+++ b/src/video_core/cdma_pusher.cpp
@@ -0,0 +1,171 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include "command_classes/host1x.h"
+#include "command_classes/nvdec.h"
+#include "command_classes/vic.h"
+#include "common/bit_util.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/command_classes/nvdec_common.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+CDmaPusher::CDmaPusher(GPU& gpu)
+    : gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)),
+      vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
+      host1x_processor(std::make_unique<Host1x>(gpu)),
+      nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)),
+      vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {}
+
+CDmaPusher::~CDmaPusher() = default;
+
+void CDmaPusher::Push(ChCommandHeaderList&& entries) {
+    cdma_queue.push(std::move(entries));
+}
+
+void CDmaPusher::DispatchCalls() {
+    while (!cdma_queue.empty()) {
+        Step();
+    }
+}
+
+void CDmaPusher::Step() {
+    const auto entries{std::move(cdma_queue.front())};
+    cdma_queue.pop();
+
+    // Every word is either an argument owed to the previous header (mask/count still pending)
+    // or a fresh header whose top nibble selects the submission mode.
+    for (const ChCommandHeader& entry : entries) {
+        const u32 value = entry.raw;
+        if (mask != 0) {
+            const u32 lbs = Common::CountTrailingZeroes32(mask);
+            mask &= ~(1U << lbs);
+            ExecuteCommand(static_cast<u32>(offset + lbs), value);
+            continue;
+        } else if (count != 0) {
+            --count;
+            ExecuteCommand(static_cast<u32>(offset), value);
+            if (incrementing) {
+                ++offset;
+            }
+            continue;
+        }
+        const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
+        switch (mode) {
+        case ChSubmissionMode::SetClass: {
+            mask = value & 0x3f;
+            offset = (value >> 16) & 0xfff;
+            current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
+            break;
+        }
+        case ChSubmissionMode::Incrementing:
+        case ChSubmissionMode::NonIncrementing:
+            count = value & 0xffff;
+            offset = (value >> 16) & 0xfff;
+            incrementing = mode == ChSubmissionMode::Incrementing;
+            break;
+        case ChSubmissionMode::Mask:
+            mask = value & 0xffff;
+            offset = (value >> 16) & 0xfff;
+            break;
+        case ChSubmissionMode::Immediate: {
+            const u32 data = value & 0xfff;
+            offset = (value >> 16) & 0xfff;
+            ExecuteCommand(static_cast<u32>(offset), data);
+            break;
+        }
+        default:
+            UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
+            break;
+        }
+    }
+}
+
+void CDmaPusher::ExecuteCommand(u32 offset, u32 data) {
+    switch (current_class) {
+    case ChClassId::NvDec:
+        ThiStateWrite(nvdec_thi_state, offset, {data});
+        switch (static_cast<ThiMethod>(offset)) {
+        case ThiMethod::IncSyncpt: {
+            LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
+            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
+            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
+            if (cond == 0) {
+                nvdec_sync->Increment(syncpoint_id);
+            } else {
+                nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
+                nvdec_sync->SignalDone(syncpoint_id);
+            }
+            break;
+        }
+        case ThiMethod::SetMethod1:
+            LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
+                      static_cast<u32>(nvdec_thi_state.method_0));
+            nvdec_processor->ProcessMethod(
+                static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data});
+            break;
+        default:
+            break;
+        }
+        break;
+    case ChClassId::GraphicsVic:
+        ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data});
+        switch (static_cast<ThiMethod>(offset)) {
+        case ThiMethod::IncSyncpt: {
+            LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
+            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
+            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
+            if (cond == 0) {
+                vic_sync->Increment(syncpoint_id);
+            } else {
+                vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
+                vic_sync->SignalDone(syncpoint_id);
+            }
+            break;
+        }
+        case ThiMethod::SetMethod1:
+            LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
+                      static_cast<u32>(vic_thi_state.method_0), data);
+            vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0),
+                                         {data});
+            break;
+        default:
+            break;
+        }
+        break;
+    case ChClassId::Host1x:
+        // This device is mainly for syncpoint synchronization
+        LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
+        host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data});
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
+        break;
+    }
+}
+
+void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) {
+    u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset;
+    std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size());
+}
+
+} // namespace Tegra
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
new file mode 100644
index 000000000..982f309c5
--- /dev/null
+++ b/src/video_core/cdma_pusher.h
@@ -0,0 +1,138 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include <queue>
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/sync_manager.h"
+
+namespace Tegra {
+
+class GPU;
+class Nvdec;
+class Vic;
+class Host1x;
+
+enum class ChSubmissionMode : u32 {
+    SetClass = 0,
+    Incrementing = 1,
+    NonIncrementing = 2,
+    Mask = 3,
+    Immediate = 4,
+    Restart = 5,
+    Gather = 6,
+};
+
+enum class ChClassId : u32 {
+    NoClass = 0x0,
+    Host1x = 0x1,
+    VideoEncodeMpeg = 0x20,
+    VideoEncodeNvEnc = 0x21,
+    VideoStreamingVi = 0x30,
+    VideoStreamingIsp = 0x32,
+    VideoStreamingIspB = 0x34,
+    VideoStreamingViI2c = 0x36,
+    GraphicsVic = 0x5d,
+    Graphics3D = 0x60,
+    GraphicsGpu = 0x61,
+    Tsec = 0xe0,
+    TsecB = 0xe1,
+    NvJpg = 0xc0,
+    NvDec = 0xf0
+};
+
+enum class ChMethod : u32 {
+    Empty = 0,
+    SetMethod = 0x10,
+    SetData = 0x11,
+};
+
+union ChCommandHeader {
+    u32 raw;
+    BitField<0, 16, u32> value;
+    BitField<16, 12, ChMethod> method_offset;
+    BitField<28, 4, ChSubmissionMode> submission_mode;
+};
+static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
+
+struct ChCommand {
+    ChClassId class_id{};
+    int method_offset{};
+    std::vector<u32> arguments;
+};
+
+using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>;
+using ChCommandList = std::vector<Tegra::ChCommand>;
+
+struct ThiRegisters {
+    u32_le increment_syncpt{};
+    INSERT_PADDING_WORDS(1);
+    u32_le increment_syncpt_error{};
+    u32_le ctx_switch_incremement_syncpt{};
+    INSERT_PADDING_WORDS(4);
+    u32_le ctx_switch{};
+    INSERT_PADDING_WORDS(1);
+    u32_le ctx_syncpt_eof{};
+    INSERT_PADDING_WORDS(5);
+    u32_le method_0{};
+    u32_le method_1{};
+    INSERT_PADDING_WORDS(12);
+    u32_le int_status{};
+    u32_le int_mask{};
+};
+
+enum class ThiMethod : u32 {
+    IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
+    SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
+    SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
+};
+
+class CDmaPusher {
+public:
+    explicit CDmaPusher(GPU& gpu);
+    ~CDmaPusher();
+
+    /// Push NVDEC command buffer entries into queue
+    void Push(ChCommandHeaderList&& entries);
+
+    /// Process queued command buffer entries
+    void DispatchCalls();
+
+    /// Process one queue element
+    void Step();
+
+    /// Invoke command class devices to execute the command based on the current state
+    void ExecuteCommand(u32 offset, u32 data);
+
+private:
+    /// Write arguments value to the ThiRegisters member at the specified offset
+    void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments);
+
+    GPU& gpu;
+
+    std::shared_ptr<Tegra::Nvdec> nvdec_processor;
+    std::unique_ptr<Tegra::Vic> vic_processor;
+    std::unique_ptr<Tegra::Host1x> host1x_processor;
+    std::unique_ptr<SyncptIncrManager> nvdec_sync;
+    std::unique_ptr<SyncptIncrManager> vic_sync;
+    ChClassId current_class{};
+    ThiRegisters vic_thi_state{};
+    ThiRegisters nvdec_thi_state{};
+
+    s32 count{};
+    s32 offset{};
+    s32 mask{};
+    bool incrementing{};
+
+    // Queue of command lists to be processed
+    std::queue<ChCommandHeaderList> cdma_queue;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp
new file mode 100644
index 000000000..2df410be8
--- /dev/null
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -0,0 +1,114 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <fstream>
+#include "common/assert.h"
+#include "video_core/command_classes/codecs/codec.h"
+#include "video_core/command_classes/codecs/h264.h"
+#include "video_core/command_classes/codecs/vp9.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+extern "C" {
+#include <libavutil/opt.h>
+}
+
+namespace Tegra {
+
+Codec::Codec(GPU& gpu_)
+    : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
+      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
+
+/// Drains and releases the FFmpeg decoder state, provided Decode() ever opened a decoder.
+Codec::~Codec() {
+    if (!initialized) {
+        // No decoder was opened, so there is nothing to drain.
+        return;
+    }
+    // Enter draining mode and discard whatever the decoder still has buffered
+    avcodec_send_packet(av_codec_ctx, nullptr);
+    avcodec_receive_frame(av_codec_ctx, av_frame);
+    avcodec_flush_buffers(av_codec_ctx);
+
+    // av_frame_free() unreferences the frame's buffers and frees the AVFrame itself
+    av_frame_free(&av_frame);
+    // avcodec_close() on its own leaves the AVCodecContext allocated; avcodec_free_context()
+    // closes the codec and frees the context as well
+    avcodec_free_context(&av_codec_ctx);
+}
+
+/// Records the codec that subsequent Decode() calls will use and logs the selection.
+void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
+    current_codec = codec;
+
+    const auto codec_id = static_cast<u32>(codec);
+    LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec_id);
+}
+
+/// Stores a 64-bit value into the NvdecRegisters state.
+/// @param offset    Destination slot, counted in 64-bit units from the start of the state
+/// @param arguments Value copied into that slot
+void Codec::StateWrite(u32 offset, u64 arguments) {
+    // The offset is taken as-is from the caller; refuse slots that do not fit inside the
+    // register block instead of overwriting whatever follows it in memory.
+    if (offset >= sizeof(state) / sizeof(u64)) {
+        LOG_ERROR(Service_NVDRV, "Out of bounds NVDEC state write at offset {}", offset);
+        return;
+    }
+    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
+    std::memcpy(state_offset, &arguments, sizeof(u64));
+}
+
+/// Decodes the frame described by the current register state.
+///
+/// The first successful call opens an FFmpeg decoder for current_codec. Every call then asks the
+/// matching header composer for the packet bytes, submits them to FFmpeg and, unless the VP9
+/// composer reports the frame as hidden, receives the decoded picture into av_frame.
+void Codec::Decode() {
+    bool is_first_frame = false;
+
+    if (!initialized) {
+        if (current_codec == NvdecCommon::VideoCodec::H264) {
+            av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
+        } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
+            av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
+        } else {
+            LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec));
+            return;
+        }
+        if (av_codec == nullptr) {
+            // avcodec_find_decoder() returns null when no matching decoder is registered
+            LOG_ERROR(Service_NVDRV, "No FFmpeg decoder found for video codec {}",
+                      static_cast<u32>(current_codec));
+            return;
+        }
+
+        av_codec_ctx = avcodec_alloc_context3(av_codec);
+        av_frame = av_frame_alloc();
+        if (av_codec_ctx == nullptr || av_frame == nullptr) {
+            LOG_ERROR(Service_NVDRV, "Failed to allocate FFmpeg decoder state");
+            // Both helpers accept a null object and reset the pointer they are given
+            av_frame_free(&av_frame);
+            avcodec_free_context(&av_codec_ctx);
+            return;
+        }
+        av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
+
+        // TODO(ameerj): libavcodec gpu hw acceleration
+
+        const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
+        if (av_error < 0) {
+            LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
+            // Free (rather than only close) both objects. This also nulls the member pointers,
+            // so GetCurrentFrame() cannot hand out a dangling frame and a later retry does not
+            // leak the previous context.
+            av_frame_free(&av_frame);
+            avcodec_free_context(&av_codec_ctx);
+            return;
+        }
+        initialized = true;
+        is_first_frame = true;
+    }
+    bool vp9_hidden_frame = false;
+
+    AVPacket packet{};
+    av_init_packet(&packet);
+    std::vector<u8> frame_data;
+
+    if (current_codec == NvdecCommon::VideoCodec::H264) {
+        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
+    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
+        frame_data = vp9_decoder->ComposeFrameHeader(state);
+        vp9_hidden_frame = vp9_decoder->WasFrameHidden();
+    }
+
+    // The packet only borrows frame_data, which stays alive until the end of this function
+    packet.data = frame_data.data();
+    packet.size = static_cast<int>(frame_data.size());
+
+    if (const int send_result = avcodec_send_packet(av_codec_ctx, &packet); send_result < 0) {
+        LOG_ERROR(Service_NVDRV, "avcodec_send_packet() failed with error {}", send_result);
+    }
+
+    if (!vp9_hidden_frame) {
+        // Only receive/store visible frames
+        const int receive_result = avcodec_receive_frame(av_codec_ctx, av_frame);
+        if (receive_result < 0) {
+            LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame() returned {}", receive_result);
+        }
+    }
+}
+
+/// Returns the AVFrame that Decode() receives decoded pictures into. The pointer is whatever
+/// av_frame currently holds; no ownership is transferred.
+AVFrame* Codec::GetCurrentFrame() {
+    return av_frame;
+}
+
+/// Read-only overload of GetCurrentFrame()
+const AVFrame* Codec::GetCurrentFrame() const {
+    return av_frame;
+}
+
+/// Returns the codec last passed to SetTargetCodec(), or VideoCodec::None (the member's
+/// default) when none has been set.
+NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
+    return current_codec;
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h
new file mode 100644
index 000000000..2e56daf29
--- /dev/null
+++ b/src/video_core/command_classes/codecs/codec.h
@@ -0,0 +1,68 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+extern "C" {
+// Silence -Wconversion for the FFmpeg header only. The diagnostic state has to be pushed first,
+// otherwise the pop below has nothing to restore and the warning stays disabled for the rest of
+// every translation unit including this header.
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+#include <libavcodec/avcodec.h>
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+namespace Tegra {
+class GPU;
+struct VicRegisters;
+
+namespace Decoder {
+class H264;
+class VP9;
+} // namespace Decoder
+
+/// Wraps an FFmpeg decoder together with the H264/VP9 header composers that turn the NVDEC
+/// register state into packets FFmpeg can decode.
+class Codec {
+public:
+    explicit Codec(GPU& gpu);
+    ~Codec();
+
+    /// Sets NVDEC video stream codec
+    void SetTargetCodec(NvdecCommon::VideoCodec codec);
+
+    /// Populate NvdecRegisters state with argument value at the provided offset
+    /// (the offset is counted in 64-bit units)
+    void StateWrite(u32 offset, u64 arguments);
+
+    /// Call decoders to construct headers, decode AVFrame with ffmpeg
+    void Decode();
+
+    /// Returns most recently decoded frame
+    AVFrame* GetCurrentFrame();
+    const AVFrame* GetCurrentFrame() const;
+
+    /// Returns the value of current_codec
+    NvdecCommon::VideoCodec GetCurrentCodec() const;
+
+private:
+    // Set once Decode() has successfully opened an FFmpeg decoder
+    bool initialized{};
+    NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
+
+    // FFmpeg objects; allocated by Decode() and released by the destructor
+    AVCodec* av_codec{nullptr};
+    AVCodecContext* av_codec_ctx{nullptr};
+    AVFrame* av_frame{nullptr};
+
+    GPU& gpu;
+    // Header composers, one per supported codec
+    std::unique_ptr<Decoder::H264> h264_decoder;
+    std::unique_ptr<Decoder::VP9> vp9_decoder;
+
+    // Register block written through StateWrite() and read by the header composers
+    NvdecCommon::NvdecRegisters state{};
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp
new file mode 100644
index 000000000..1a39f7b23
--- /dev/null
+++ b/src/video_core/command_classes/codecs/h264.cpp
@@ -0,0 +1,276 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include "common/bit_util.h"
+#include "video_core/command_classes/codecs/h264.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Decoder {
+/// Stores the GPU reference whose memory manager ComposeFrameHeader() reads from.
+H264::H264(GPU& gpu_) : gpu(gpu_) {}
+
+/// Nothing beyond the members' own destructors needs to run.
+H264::~H264() = default;
+
+/// Builds the byte stream handed to FFmpeg for the frame described by state.
+///
+/// The picture info block and the frame data are read from GPU memory. For the first frame, and
+/// whenever the frame number stored in the parameter set flags is zero, two NAL units (type 7
+/// and type 8, i.e. SPS and PPS) are encoded from the parameter set and placed in front of the
+/// frame data. Every other frame is returned as read.
+///
+/// @param state          NVDEC registers providing picture_info_offset and
+///                       frame_bitstream_offset
+/// @param is_first_frame Forces the SPS/PPS header to be generated
+/// @returns Reference to an internal buffer that is overwritten by the next call
+std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) {
+    H264DecoderContext context{};
+    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
+    const auto& params = context.h264_parameter_set;
+
+    const s32 frame_number = static_cast<s32>((params.flags >> 46) & 0x1ffff);
+    if (!is_first_frame && frame_number != 0) {
+        // No header required, pass the frame data through untouched
+        frame.resize(context.frame_data_size);
+        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
+        return frame;
+    }
+
+    /// Encode header
+    H264BitWriter writer{};
+
+    // H264 SPS
+    // NOTE(review): the field names in the comments of this function follow the order of the
+    // H.264 SPS/PPS syntax tables and are inferred from the write order - confirm against the
+    // specification before relying on them.
+    writer.WriteU(1, 24);  // Start code 0x000001
+    writer.WriteU(0, 1);   // forbidden_zero_bit
+    writer.WriteU(3, 2);   // nal_ref_idc
+    writer.WriteU(7, 5);   // nal_unit_type
+    writer.WriteU(100, 8); // profile_idc
+    writer.WriteU(0, 8);   // Constraint flags and reserved bits
+    writer.WriteU(31, 8);  // level_idc
+    writer.WriteUe(0);     // seq_parameter_set_id
+    const s32 chroma_format_idc = static_cast<s32>((params.flags >> 12) & 0x3);
+    writer.WriteUe(chroma_format_idc);
+    if (chroma_format_idc == 3) {
+        writer.WriteBit(false);
+    }
+
+    writer.WriteUe(0);
+    writer.WriteUe(0);
+    writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
+    writer.WriteBit(false); // Scaling matrix present flag
+
+    const s32 order_cnt_type = static_cast<s32>((params.flags >> 14) & 3);
+    writer.WriteUe(static_cast<s32>((params.flags >> 8) & 0xf));
+    writer.WriteUe(order_cnt_type);
+    if (order_cnt_type == 0) {
+        writer.WriteUe(static_cast<s32>(params.log2_max_pic_order_cnt));
+    } else if (order_cnt_type == 1) {
+        writer.WriteBit(params.delta_pic_order_always_zero_flag != 0);
+
+        writer.WriteSe(0);
+        writer.WriteSe(0);
+        writer.WriteUe(0);
+    }
+
+    // Halved when the stream is not frame-only
+    const s32 pic_height =
+        static_cast<s32>(params.pic_height_in_map_units / (params.frame_mbs_only_flag ? 1 : 2));
+
+    writer.WriteUe(16);
+    writer.WriteBit(false);
+    writer.WriteUe(static_cast<s32>(params.pic_width_in_mbs - 1));
+    writer.WriteUe(pic_height - 1);
+    writer.WriteBit(params.frame_mbs_only_flag != 0);
+
+    if (!params.frame_mbs_only_flag) {
+        writer.WriteBit(((params.flags >> 0) & 1) != 0);
+    }
+
+    writer.WriteBit(((params.flags >> 1) & 1) != 0);
+    writer.WriteBit(false); // Frame cropping flag
+    writer.WriteBit(false); // VUI parameter present flag
+
+    writer.End();
+
+    // H264 PPS
+    writer.WriteU(1, 24); // Start code 0x000001
+    writer.WriteU(0, 1);  // forbidden_zero_bit
+    writer.WriteU(3, 2);  // nal_ref_idc
+    writer.WriteU(8, 5);  // nal_unit_type
+
+    writer.WriteUe(0);
+    writer.WriteUe(0);
+
+    writer.WriteBit(params.entropy_coding_mode_flag != 0);
+    writer.WriteBit(false);
+    writer.WriteUe(0);
+    writer.WriteUe(static_cast<s32>(params.num_refidx_l0_default_active));
+    writer.WriteUe(static_cast<s32>(params.num_refidx_l1_default_active));
+    writer.WriteBit(((params.flags >> 2) & 1) != 0);
+    writer.WriteU(static_cast<s32>((params.flags >> 32) & 0x3), 2);
+
+    // The QP values are stored as narrow two's complement fields; shifting up to bit 31 and back
+    // down sign-extends them to a full s32.
+    s32 pic_init_qp = static_cast<s32>((params.flags >> 16) & 0x3f);
+    pic_init_qp = (pic_init_qp << 26) >> 26;
+    writer.WriteSe(pic_init_qp);
+    writer.WriteSe(0);
+    s32 chroma_qp_index_offset = static_cast<s32>((params.flags >> 22) & 0x1f);
+    chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
+
+    writer.WriteSe(chroma_qp_index_offset);
+    writer.WriteBit(params.deblocking_filter_control_flag != 0);
+    writer.WriteBit(((params.flags >> 3) & 1) != 0);
+    writer.WriteBit(params.redundant_pic_count_flag != 0);
+    writer.WriteBit(params.transform_8x8_mode_flag != 0);
+
+    writer.WriteBit(true);
+
+    // Six 16-entry lists are always written; each one is preceded by a set flag bit.
+    // The matrix is copied into a vector once instead of once per list.
+    const std::vector<u8> matrix_x4(context.scaling_matrix_4.begin(),
+                                    context.scaling_matrix_4.end());
+    for (s32 index = 0; index < 6; index++) {
+        writer.WriteBit(true);
+        writer.WriteScalingList(matrix_x4, index * 16, 16);
+    }
+
+    if (params.transform_8x8_mode_flag) {
+        // Two 64-entry lists follow when the 8x8 transform is enabled
+        const std::vector<u8> matrix_x8(context.scaling_matrix_8.begin(),
+                                        context.scaling_matrix_8.end());
+        for (s32 index = 0; index < 2; index++) {
+            writer.WriteBit(true);
+            writer.WriteScalingList(matrix_x8, index * 64, 64);
+        }
+    }
+
+    s32 chroma_qp_index_offset2 = static_cast<s32>((params.flags >> 27) & 0x1f);
+    chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
+
+    writer.WriteSe(chroma_qp_index_offset2);
+
+    writer.End();
+
+    // Final layout: encoded header bytes followed by the frame data
+    const auto& encoded_header = writer.GetByteArray();
+    frame.resize(encoded_header.size() + context.frame_data_size);
+    std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
+
+    gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
+                                  frame.data() + encoded_header.size(),
+                                  context.frame_data_size);
+
+    return frame;
+}
+
+/// All members carry in-class initializers, so construction and destruction are defaulted.
+H264BitWriter::H264BitWriter() = default;
+
+H264BitWriter::~H264BitWriter() = default;
+
+/// Writes the value_sz low bits of value as a fixed-width field.
+void H264BitWriter::WriteU(s32 value, s32 value_sz) {
+    WriteBits(value, value_sz);
+}
+
+/// Writes value as a signed Exp-Golomb code.
+void H264BitWriter::WriteSe(s32 value) {
+    WriteExpGolombCodedInt(value);
+}
+
+/// Writes value as an unsigned Exp-Golomb code. The argument is reinterpreted as unsigned, so
+/// callers are expected to pass non-negative values.
+void H264BitWriter::WriteUe(s32 value) {
+    WriteExpGolombCodedUInt(static_cast<u32>(value));
+}
+
+/// Terminates the current unit: appends a single set bit, then flushes the staged bits to
+/// byte_array. Any unused low bits of the flushed byte are left at zero.
+void H264BitWriter::End() {
+    WriteBit(true);
+    Flush();
+}
+
+/// Appends one bit to the stream: 1 when state is true, 0 otherwise.
+void H264BitWriter::WriteBit(bool state) {
+    const s32 bit = state ? 1 : 0;
+    WriteBits(bit, 1);
+}
+
+/// Writes count scaling list coefficients of list, beginning at element start, as a sequence of
+/// signed Exp-Golomb deltas. Coefficients are visited in zig-zag order and every delta is taken
+/// against the previously written coefficient, starting from a predictor of 8.
+/// @param list  Storage holding one or more consecutive scaling lists
+/// @param start Index of the first element of the list to write
+/// @param count Number of coefficients to write; 16 selects the 16-entry scan table, any other
+///              value the 64-entry one (callers are expected to pass 16 or 64)
+void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
+    // Index the constexpr scan tables directly; copying them into a temporary vector on every
+    // call is not needed.
+    const u8* const scan = count == 16 ? zig_zag_scan.data() : zig_zag_direct.data();
+    u8 last_scale = 8;
+
+    for (s32 index = 0; index < count; index++) {
+        const u8 value = list[start + scan[index]];
+
+        // The H.264 syntax limits delta_scale to [-128, 127] and the decoder reconstructs each
+        // coefficient modulo 256, so wrap the raw difference into that range. The decoded
+        // coefficient is unchanged by the wrap.
+        s32 delta_scale = static_cast<s32>(value) - static_cast<s32>(last_scale);
+        if (delta_scale > 127) {
+            delta_scale -= 256;
+        } else if (delta_scale < -128) {
+            delta_scale += 256;
+        }
+
+        WriteSe(delta_scale);
+
+        last_scale = value;
+    }
+}
+
+/// Returns the bytes flushed so far. Bits still staged in the buffer are not included until
+/// Flush() (or End()) has run.
+std::vector<u8>& H264BitWriter::GetByteArray() {
+    return byte_array;
+}
+
+/// Read-only overload of GetByteArray()
+const std::vector<u8>& H264BitWriter::GetByteArray() const {
+    return byte_array;
+}
+
+/// Appends the bit_count low bits of value to the stream, most significant bit first. Bits are
+/// staged in a buffer of buffer_size bits which is flushed to byte_array whenever it is full.
+void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
+    s32 bits_written = 0;
+
+    for (s32 bits_left = bit_count; bits_left > 0;) {
+        // This may flush a full staging buffer, so it has to run before buffer_pos is used
+        const s32 available = GetFreeBufferBits();
+        const s32 chunk = bits_left > available ? available : bits_left;
+
+        const s32 chunk_mask = (1 << chunk) - 1;
+
+        // Take the next chunk from the top of the not yet written part of value and place it
+        // directly below the bits already staged
+        const s32 value_shift = (bit_count - bits_written) - chunk;
+        const s32 buffer_shift = (buffer_size - buffer_pos) - chunk;
+
+        buffer |= ((value >> value_shift) & chunk_mask) << buffer_shift;
+
+        bits_written += chunk;
+        buffer_pos += chunk;
+        bits_left -= chunk;
+    }
+}
+
+/// Maps a signed value onto an unsigned code number and writes it as an Exp-Golomb code:
+/// a positive v becomes 2v - 1, zero and negative v become 2|v|.
+void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
+    const bool is_positive = value > 0;
+    const s32 magnitude = value < 0 ? -value : value;
+    const s32 code_num = (magnitude << 1) - (is_positive ? 1 : 0);
+    WriteExpGolombCodedUInt(code_num);
+}
+
+/// Writes value as an unsigned Exp-Golomb code: a prefix of leading zero bits terminated by a
+/// one bit, followed by a suffix with as many bits as the prefix has zeros.
+void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
+    // Number of significant bits of value + 1
+    const s32 prefix_bits = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1));
+    const s32 suffix_bits = prefix_bits - 1;
+
+    // Writing 1 in prefix_bits bits yields suffix_bits zeros and the terminating one
+    WriteBits(1, prefix_bits);
+
+    // value + 1 with its top bit removed
+    const u32 suffix = value - ((1U << suffix_bits) - 1);
+    WriteBits(static_cast<s32>(suffix), suffix_bits);
+}
+
+/// Returns how many more bits fit into the staging buffer. A completely filled buffer is flushed
+/// first, so the result is never zero.
+s32 H264BitWriter::GetFreeBufferBits() {
+    const bool buffer_full = buffer_pos == buffer_size;
+    if (buffer_full) {
+        Flush();
+    }
+
+    const s32 free_bits = buffer_size - buffer_pos;
+    return free_bits;
+}
+
+/// Moves the staged bits into byte_array as one byte and empties the staging buffer. Does
+/// nothing when no bits are staged.
+void H264BitWriter::Flush() {
+    if (buffer_pos != 0) {
+        byte_array.push_back(static_cast<u8>(buffer));
+
+        buffer_pos = 0;
+        buffer = 0;
+    }
+}
+} // namespace Tegra::Decoder
diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h
new file mode 100644
index 000000000..21752dd90
--- /dev/null
+++ b/src/video_core/command_classes/codecs/h264.h
@@ -0,0 +1,130 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <array>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+namespace Decoder {
+
+/// Accumulates individual bits and Exp-Golomb coded values, most significant bit first, into a
+/// byte vector.
+class H264BitWriter {
+public:
+    H264BitWriter();
+    ~H264BitWriter();
+
+    /// The following Write methods are based on clause 9.1 in the H.264 specification.
+    /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
+    void WriteU(s32 value, s32 value_sz);
+    void WriteSe(s32 value);
+    void WriteUe(s32 value);
+
+    /// Finalize the bitstream
+    void End();
+
+    /// Append a bit to the stream, equivalent value to the state parameter
+    void WriteBit(bool state);
+
+    /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
+    /// Writes the scaling matrices of the stream
+    void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
+
+    /// Return the bitstream as a vector.
+    std::vector<u8>& GetByteArray();
+    const std::vector<u8>& GetByteArray() const;
+
+private:
+    // ZigZag LUTs from libavcodec.
+    // 64-entry scan order, used by WriteScalingList() for every count other than 16
+    static constexpr std::array<u8, 64> zig_zag_direct{
+        0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,  12, 19, 26, 33, 40, 48,
+        41, 34, 27, 20, 13, 6,  7,  14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
+        30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+    };
+
+    // 16-entry scan order, each entry written as column + row * 4
+    static constexpr std::array<u8, 16> zig_zag_scan{
+        0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
+        1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
+    };
+
+    // Appends the bit_count low bits of value
+    void WriteBits(s32 value, s32 bit_count);
+    // Signed / unsigned Exp-Golomb encoders backing WriteSe() and WriteUe()
+    void WriteExpGolombCodedInt(s32 value);
+    void WriteExpGolombCodedUInt(u32 value);
+    // Free space in the staging buffer, in bits; flushes the buffer first when it is full
+    s32 GetFreeBufferBits();
+    // Moves the staged bits into byte_array
+    void Flush();
+
+    // Capacity of the staging buffer, in bits
+    s32 buffer_size{8};
+
+    // Bits staged for the next output byte, filled from the top down
+    s32 buffer{};
+    // Number of bits currently staged in buffer
+    s32 buffer_pos{};
+    // Completed output bytes
+    std::vector<u8> byte_array;
+};
+
+/// Composes the data passed to FFmpeg for H264 frames from the NVDEC register state and the
+/// picture info stored in GPU memory.
+class H264 {
+public:
+    explicit H264(GPU& gpu);
+    ~H264();
+
+    /// Compose the H264 header of the frame for FFmpeg decoding
+    /// The returned reference points at an internal buffer that the next call overwrites.
+    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
+                                        bool is_first_frame = false);
+
+private:
+    // Parameter set as it is laid out inside H264DecoderContext. The struct is filled by a raw
+    // memory read, so member order and padding must not change (see the static_assert below).
+    struct H264ParameterSet {
+        u32 log2_max_pic_order_cnt{};
+        u32 delta_pic_order_always_zero_flag{};
+        u32 frame_mbs_only_flag{};
+        u32 pic_width_in_mbs{};
+        u32 pic_height_in_map_units{};
+        INSERT_PADDING_WORDS(1);
+        u32 entropy_coding_mode_flag{};
+        u32 bottom_field_pic_order_flag{};
+        u32 num_refidx_l0_default_active{};
+        u32 num_refidx_l1_default_active{};
+        u32 deblocking_filter_control_flag{};
+        u32 redundant_pic_count_flag{};
+        u32 transform_8x8_mode_flag{};
+        INSERT_PADDING_WORDS(9);
+        // Packed bit fields; ComposeFrameHeader() extracts the individual values by shift/mask
+        u64 flags{};
+        u32 frame_number{};
+        u32 frame_number2{};
+    };
+    static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
+
+    // Picture info block read from the address in NvdecRegisters::picture_info_offset
+    struct H264DecoderContext {
+        INSERT_PADDING_BYTES(0x48);
+        // Size in bytes of the frame data at NvdecRegisters::frame_bitstream_offset
+        u32 frame_data_size{};
+        INSERT_PADDING_BYTES(0xc);
+        H264ParameterSet h264_parameter_set{};
+        INSERT_PADDING_BYTES(0x100);
+        // 0x60 bytes = six lists of 16 coefficients
+        std::array<u8, 0x60> scaling_matrix_4;
+        // 0x80 bytes = two lists of 64 coefficients
+        std::array<u8, 0x80> scaling_matrix_8;
+    };
+    static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
+
+    // Output buffer returned by ComposeFrameHeader()
+    std::vector<u8> frame;
+    GPU& gpu;
+};
+
+} // namespace Decoder
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp
new file mode 100644
index 000000000..3bae0bb5d
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9.cpp
@@ -0,0 +1,1010 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring> // for std::memcpy
+#include <numeric>
+#include "video_core/command_classes/codecs/vp9.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Decoder {
+
+// Default entropy probabilities; copied into prev_frame_probs when a key frame resets the context.
+constexpr Vp9EntropyProbs default_probs{
+    .y_mode_prob{
+        65,  32, 18, 144, 162, 194, 41, 51, 98, 132, 68,  18, 165, 217, 196, 45, 40, 78,
+        173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29,
+    },
+    .partition_prob{ // groups of 3 values followed by a 0 (see WriteProbabilityUpdateAligned4)
+        199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0,
+        174, 73,  87,  0, 92,  41, 83,  0, 82,  99,  50,  0, 53,  39,  39,  0,
+        177, 58,  59,  0, 68,  26, 63,  0, 52,  79,  25,  0, 17,  14,  12,  0,
+        222, 34,  30,  0, 72,  16, 44,  0, 58,  32,  12,  0, 10,  7,   6,   0,
+    },
+    .coef_probs{ // 2304 bytes = 4 blocks of 2*2*6*6 entries, each 3 values followed by a 0
+        195, 29,  183, 0, 84,  49,  136, 0, 8,   42,  71,  0, 0,   0,   0,   0, 0,   0,   0,   0,
+        0,   0,   0,   0, 31,  107, 169, 0, 35,  99,  159, 0, 17,  82,  140, 0, 8,   66,  114, 0,
+        2,   44,  76,  0, 1,   19,  32,  0, 40,  132, 201, 0, 29,  114, 187, 0, 13,  91,  157, 0,
+        7,   75,  127, 0, 3,   58,  95,  0, 1,   28,  47,  0, 69,  142, 221, 0, 42,  122, 201, 0,
+        15,  91,  159, 0, 6,   67,  121, 0, 1,   42,  77,  0, 1,   17,  31,  0, 102, 148, 228, 0,
+        67,  117, 204, 0, 17,  82,  154, 0, 6,   59,  114, 0, 2,   39,  75,  0, 1,   15,  29,  0,
+        156, 57,  233, 0, 119, 57,  212, 0, 58,  48,  163, 0, 29,  40,  124, 0, 12,  30,  81,  0,
+        3,   12,  31,  0, 191, 107, 226, 0, 124, 117, 204, 0, 25,  99,  155, 0, 0,   0,   0,   0,
+        0,   0,   0,   0, 0,   0,   0,   0, 29,  148, 210, 0, 37,  126, 194, 0, 8,   93,  157, 0,
+        2,   68,  118, 0, 1,   39,  69,  0, 1,   17,  33,  0, 41,  151, 213, 0, 27,  123, 193, 0,
+        3,   82,  144, 0, 1,   58,  105, 0, 1,   32,  60,  0, 1,   13,  26,  0, 59,  159, 220, 0,
+        23,  126, 198, 0, 4,   88,  151, 0, 1,   66,  114, 0, 1,   38,  71,  0, 1,   18,  34,  0,
+        114, 136, 232, 0, 51,  114, 207, 0, 11,  83,  155, 0, 3,   56,  105, 0, 1,   33,  65,  0,
+        1,   17,  34,  0, 149, 65,  234, 0, 121, 57,  215, 0, 61,  49,  166, 0, 28,  36,  114, 0,
+        12,  25,  76,  0, 3,   16,  42,  0, 214, 49,  220, 0, 132, 63,  188, 0, 42,  65,  137, 0,
+        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  137, 221, 0, 104, 131, 216, 0,
+        49,  111, 192, 0, 21,  87,  155, 0, 2,   49,  87,  0, 1,   16,  28,  0, 89,  163, 230, 0,
+        90,  137, 220, 0, 29,  100, 183, 0, 10,  70,  135, 0, 2,   42,  81,  0, 1,   17,  33,  0,
+        108, 167, 237, 0, 55,  133, 222, 0, 15,  97,  179, 0, 4,   72,  135, 0, 1,   45,  85,  0,
+        1,   19,  38,  0, 124, 146, 240, 0, 66,  124, 224, 0, 17,  88,  175, 0, 4,   58,  122, 0,
+        1,   36,  75,  0, 1,   18,  37,  0, 141, 79,  241, 0, 126, 70,  227, 0, 66,  58,  182, 0,
+        30,  44,  136, 0, 12,  34,  96,  0, 2,   20,  47,  0, 229, 99,  249, 0, 143, 111, 235, 0,
+        46,  109, 192, 0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 82,  158, 236, 0,
+        94,  146, 224, 0, 25,  117, 191, 0, 9,   87,  149, 0, 3,   56,  99,  0, 1,   33,  57,  0,
+        83,  167, 237, 0, 68,  145, 222, 0, 10,  103, 177, 0, 2,   72,  131, 0, 1,   41,  79,  0,
+        1,   20,  39,  0, 99,  167, 239, 0, 47,  141, 224, 0, 10,  104, 178, 0, 2,   73,  133, 0,
+        1,   44,  85,  0, 1,   22,  47,  0, 127, 145, 243, 0, 71,  129, 228, 0, 17,  93,  177, 0,
+        3,   61,  124, 0, 1,   41,  84,  0, 1,   21,  52,  0, 157, 78,  244, 0, 140, 72,  231, 0,
+        69,  58,  184, 0, 31,  44,  137, 0, 14,  38,  105, 0, 8,   23,  61,  0, 125, 34,  187, 0,
+        52,  41,  133, 0, 6,   31,  56,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0,
+        37,  109, 153, 0, 51,  102, 147, 0, 23,  87,  128, 0, 8,   67,  101, 0, 1,   41,  63,  0,
+        1,   19,  29,  0, 31,  154, 185, 0, 17,  127, 175, 0, 6,   96,  145, 0, 2,   73,  114, 0,
+        1,   51,  82,  0, 1,   28,  45,  0, 23,  163, 200, 0, 10,  131, 185, 0, 2,   93,  148, 0,
+        1,   67,  111, 0, 1,   41,  69,  0, 1,   14,  24,  0, 29,  176, 217, 0, 12,  145, 201, 0,
+        3,   101, 156, 0, 1,   69,  111, 0, 1,   39,  63,  0, 1,   14,  23,  0, 57,  192, 233, 0,
+        25,  154, 215, 0, 6,   109, 167, 0, 3,   78,  118, 0, 1,   48,  69,  0, 1,   21,  29,  0,
+        202, 105, 245, 0, 108, 106, 216, 0, 18,  90,  144, 0, 0,   0,   0,   0, 0,   0,   0,   0,
+        0,   0,   0,   0, 33,  172, 219, 0, 64,  149, 206, 0, 14,  117, 177, 0, 5,   90,  141, 0,
+        2,   61,  95,  0, 1,   37,  57,  0, 33,  179, 220, 0, 11,  140, 198, 0, 1,   89,  148, 0,
+        1,   60,  104, 0, 1,   33,  57,  0, 1,   12,  21,  0, 30,  181, 221, 0, 8,   141, 198, 0,
+        1,   87,  145, 0, 1,   58,  100, 0, 1,   31,  55,  0, 1,   12,  20,  0, 32,  186, 224, 0,
+        7,   142, 198, 0, 1,   86,  143, 0, 1,   58,  100, 0, 1,   31,  55,  0, 1,   12,  22,  0,
+        57,  192, 227, 0, 20,  143, 204, 0, 3,   96,  154, 0, 1,   68,  112, 0, 1,   42,  69,  0,
+        1,   19,  32,  0, 212, 35,  215, 0, 113, 47,  169, 0, 29,  48,  105, 0, 0,   0,   0,   0,
+        0,   0,   0,   0, 0,   0,   0,   0, 74,  129, 203, 0, 106, 120, 203, 0, 49,  107, 178, 0,
+        19,  84,  144, 0, 4,   50,  84,  0, 1,   15,  25,  0, 71,  172, 217, 0, 44,  141, 209, 0,
+        15,  102, 173, 0, 6,   76,  133, 0, 2,   51,  89,  0, 1,   24,  42,  0, 64,  185, 231, 0,
+        31,  148, 216, 0, 8,   103, 175, 0, 3,   74,  131, 0, 1,   46,  81,  0, 1,   18,  30,  0,
+        65,  196, 235, 0, 25,  157, 221, 0, 5,   105, 174, 0, 1,   67,  120, 0, 1,   38,  69,  0,
+        1,   15,  30,  0, 65,  204, 238, 0, 30,  156, 224, 0, 7,   107, 177, 0, 2,   70,  124, 0,
+        1,   42,  73,  0, 1,   18,  34,  0, 225, 86,  251, 0, 144, 104, 235, 0, 42,  99,  181, 0,
+        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  175, 239, 0, 112, 165, 229, 0,
+        29,  136, 200, 0, 12,  103, 162, 0, 6,   77,  123, 0, 2,   53,  84,  0, 75,  183, 239, 0,
+        30,  155, 221, 0, 3,   106, 171, 0, 1,   74,  128, 0, 1,   44,  76,  0, 1,   17,  28,  0,
+        73,  185, 240, 0, 27,  159, 222, 0, 2,   107, 172, 0, 1,   75,  127, 0, 1,   42,  73,  0,
+        1,   17,  29,  0, 62,  190, 238, 0, 21,  159, 222, 0, 2,   107, 172, 0, 1,   72,  122, 0,
+        1,   40,  71,  0, 1,   18,  32,  0, 61,  199, 240, 0, 27,  161, 226, 0, 4,   113, 180, 0,
+        1,   76,  129, 0, 1,   46,  80,  0, 1,   23,  41,  0, 7,   27,  153, 0, 5,   30,  95,  0,
+        1,   16,  30,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 50,  75,  127, 0,
+        57,  75,  124, 0, 27,  67,  108, 0, 10,  54,  86,  0, 1,   33,  52,  0, 1,   12,  18,  0,
+        43,  125, 151, 0, 26,  108, 148, 0, 7,   83,  122, 0, 2,   59,  89,  0, 1,   38,  60,  0,
+        1,   17,  27,  0, 23,  144, 163, 0, 13,  112, 154, 0, 2,   75,  117, 0, 1,   50,  81,  0,
+        1,   31,  51,  0, 1,   14,  23,  0, 18,  162, 185, 0, 6,   123, 171, 0, 1,   78,  125, 0,
+        1,   51,  86,  0, 1,   31,  54,  0, 1,   14,  23,  0, 15,  199, 227, 0, 3,   150, 204, 0,
+        1,   91,  146, 0, 1,   55,  95,  0, 1,   30,  53,  0, 1,   11,  20,  0, 19,  55,  240, 0,
+        19,  59,  196, 0, 3,   52,  105, 0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0,
+        41,  166, 207, 0, 104, 153, 199, 0, 31,  123, 181, 0, 14,  101, 152, 0, 5,   72,  106, 0,
+        1,   36,  52,  0, 35,  176, 211, 0, 12,  131, 190, 0, 2,   88,  144, 0, 1,   60,  101, 0,
+        1,   36,  60,  0, 1,   16,  28,  0, 28,  183, 213, 0, 8,   134, 191, 0, 1,   86,  142, 0,
+        1,   56,  96,  0, 1,   30,  53,  0, 1,   12,  20,  0, 20,  190, 215, 0, 4,   135, 192, 0,
+        1,   84,  139, 0, 1,   53,  91,  0, 1,   28,  49,  0, 1,   11,  20,  0, 13,  196, 216, 0,
+        2,   137, 192, 0, 1,   86,  143, 0, 1,   57,  99,  0, 1,   32,  56,  0, 1,   13,  24,  0,
+        211, 29,  217, 0, 96,  47,  156, 0, 22,  43,  87,  0, 0,   0,   0,   0, 0,   0,   0,   0,
+        0,   0,   0,   0, 78,  120, 193, 0, 111, 116, 186, 0, 46,  102, 164, 0, 15,  80,  128, 0,
+        2,   49,  76,  0, 1,   18,  28,  0, 71,  161, 203, 0, 42,  132, 192, 0, 10,  98,  150, 0,
+        3,   69,  109, 0, 1,   44,  70,  0, 1,   18,  29,  0, 57,  186, 211, 0, 30,  140, 196, 0,
+        4,   93,  146, 0, 1,   62,  102, 0, 1,   38,  65,  0, 1,   16,  27,  0, 47,  199, 217, 0,
+        14,  145, 196, 0, 1,   88,  142, 0, 1,   57,  98,  0, 1,   36,  62,  0, 1,   15,  26,  0,
+        26,  219, 229, 0, 5,   155, 207, 0, 1,   94,  151, 0, 1,   60,  104, 0, 1,   36,  62,  0,
+        1,   16,  28,  0, 233, 29,  248, 0, 146, 47,  220, 0, 43,  52,  140, 0, 0,   0,   0,   0,
+        0,   0,   0,   0, 0,   0,   0,   0, 100, 163, 232, 0, 179, 161, 222, 0, 63,  142, 204, 0,
+        37,  113, 174, 0, 26,  89,  137, 0, 18,  68,  97,  0, 85,  181, 230, 0, 32,  146, 209, 0,
+        7,   100, 164, 0, 3,   71,  121, 0, 1,   45,  77,  0, 1,   18,  30,  0, 65,  187, 230, 0,
+        20,  148, 207, 0, 2,   97,  159, 0, 1,   68,  116, 0, 1,   40,  70,  0, 1,   14,  29,  0,
+        40,  194, 227, 0, 8,   147, 204, 0, 1,   94,  155, 0, 1,   65,  112, 0, 1,   39,  66,  0,
+        1,   14,  26,  0, 16,  208, 228, 0, 3,   151, 207, 0, 1,   98,  160, 0, 1,   67,  117, 0,
+        1,   41,  74,  0, 1,   17,  31,  0, 17,  38,  140, 0, 7,   34,  80,  0, 1,   17,  29,  0,
+        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 37,  75,  128, 0, 41,  76,  128, 0,
+        26,  66,  116, 0, 12,  52,  94,  0, 2,   32,  55,  0, 1,   10,  16,  0, 50,  127, 154, 0,
+        37,  109, 152, 0, 16,  82,  121, 0, 5,   59,  85,  0, 1,   35,  54,  0, 1,   13,  20,  0,
+        40,  142, 167, 0, 17,  110, 157, 0, 2,   71,  112, 0, 1,   44,  72,  0, 1,   27,  45,  0,
+        1,   11,  17,  0, 30,  175, 188, 0, 9,   124, 169, 0, 1,   74,  116, 0, 1,   48,  78,  0,
+        1,   30,  49,  0, 1,   11,  18,  0, 10,  222, 223, 0, 2,   150, 194, 0, 1,   83,  128, 0,
+        1,   48,  79,  0, 1,   27,  45,  0, 1,   11,  17,  0, 36,  41,  235, 0, 29,  36,  193, 0,
+        10,  27,  111, 0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  165, 222, 0,
+        177, 162, 215, 0, 110, 135, 195, 0, 57,  113, 168, 0, 23,  83,  120, 0, 10,  49,  61,  0,
+        85,  190, 223, 0, 36,  139, 200, 0, 5,   90,  146, 0, 1,   60,  103, 0, 1,   38,  65,  0,
+        1,   18,  30,  0, 72,  202, 223, 0, 23,  141, 199, 0, 2,   86,  140, 0, 1,   56,  97,  0,
+        1,   36,  61,  0, 1,   16,  27,  0, 55,  218, 225, 0, 13,  145, 200, 0, 1,   86,  141, 0,
+        1,   57,  99,  0, 1,   35,  61,  0, 1,   13,  22,  0, 15,  235, 212, 0, 1,   132, 184, 0,
+        1,   84,  139, 0, 1,   57,  97,  0, 1,   34,  56,  0, 1,   14,  23,  0, 181, 21,  201, 0,
+        61,  37,  123, 0, 10,  38,  71,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0,
+        47,  106, 172, 0, 95,  104, 173, 0, 42,  93,  159, 0, 18,  77,  131, 0, 4,   50,  81,  0,
+        1,   17,  23,  0, 62,  147, 199, 0, 44,  130, 189, 0, 28,  102, 154, 0, 18,  75,  115, 0,
+        2,   44,  65,  0, 1,   12,  19,  0, 55,  153, 210, 0, 24,  130, 194, 0, 3,   93,  146, 0,
+        1,   61,  97,  0, 1,   31,  50,  0, 1,   10,  16,  0, 49,  186, 223, 0, 17,  148, 204, 0,
+        1,   96,  142, 0, 1,   53,  83,  0, 1,   26,  44,  0, 1,   11,  17,  0, 13,  217, 212, 0,
+        2,   136, 180, 0, 1,   78,  124, 0, 1,   50,  83,  0, 1,   29,  49,  0, 1,   14,  23,  0,
+        197, 13,  247, 0, 82,  17,  222, 0, 25,  17,  162, 0, 0,   0,   0,   0, 0,   0,   0,   0,
+        0,   0,   0,   0, 126, 186, 247, 0, 234, 191, 243, 0, 176, 177, 234, 0, 104, 158, 220, 0,
+        66,  128, 186, 0, 55,  90,  137, 0, 111, 197, 242, 0, 46,  158, 219, 0, 9,   104, 171, 0,
+        2,   65,  125, 0, 1,   44,  80,  0, 1,   17,  91,  0, 104, 208, 245, 0, 39,  168, 224, 0,
+        3,   109, 162, 0, 1,   79,  124, 0, 1,   50,  102, 0, 1,   43,  102, 0, 84,  220, 246, 0,
+        31,  177, 231, 0, 2,   115, 180, 0, 1,   79,  134, 0, 1,   55,  77,  0, 1,   60,  79,  0,
+        43,  243, 240, 0, 8,   180, 217, 0, 1,   115, 166, 0, 1,   84,  121, 0, 1,   51,  67,  0,
+        1,   16,  6,   0,
+    },
+    .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144},
+    .inter_mode_prob{ // groups of 3 values followed by a 0 (see WriteProbabilityUpdateAligned4)
+        2,  173, 34, 0,  7,  145, 85, 0,  7,  166, 63, 0,  7,  94,
+        66, 0,   8,  64, 46, 0,   17, 81, 31, 0,   25, 29, 30, 0,
+    },
+    .intra_inter_prob{9, 102, 187, 225},
+    .comp_inter_prob{9, 102, 187, 225, 0}, // NOTE(review): libvpx default is {239, 183, 119, 96, 41}; confirm
+    .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247},
+    .comp_ref_prob{50, 126, 123, 221, 226},
+    .tx_32x32_prob{3, 136, 37, 5, 52, 13},
+    .tx_16x16_prob{20, 152, 15, 101},
+    .tx_8x8_prob{100, 66},
+    .skip_probs{192, 128, 64},
+    .joints{32, 64, 96},
+    .sign{128, 128},
+    .classes{
+        224, 144, 192, 168, 192, 176, 192, 198, 198, 245,
+        216, 128, 176, 160, 176, 176, 192, 198, 198, 208,
+    },
+    .class_0{216, 208},
+    .prob_bits{
+        136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
+        136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
+    },
+    .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64},
+    .fr{64, 96, 64, 64, 96, 64},
+    .class_0_hp{160, 160},
+    .high_precision{128, 128},
+};
+
+VP9::VP9(GPU& gpu) : gpu(gpu) {} // only binds the GPU reference; no other work is done here
+
+VP9::~VP9() = default; // compiler-generated destructor
+
+/// Writes an "updated" flag for one probability, followed by the coded delta if it changed.
+void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    const bool has_changed = old_prob != new_prob;
+    writer.Write(has_changed, diff_update_probability);
+    if (!has_changed) {
+        return;
+    }
+    WriteProbabilityDelta(writer, new_prob, old_prob);
+}
+template <typename T, std::size_t N>
+void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                 const std::array<T, N>& old_prob) {
+    for (std::size_t i = 0; i != N; ++i) { // one update per element, in index order
+        WriteProbabilityUpdate(writer, new_prob[i], old_prob[i]);
+    }
+}
+
+template <typename T, std::size_t N>
+void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                         const std::array<T, N>& old_prob) {
+    for (std::size_t group = 0; group < N; group += 4) { // groups of 4: 3 written, 1 skipped
+        for (std::size_t i = group; i < group + 3; ++i) {
+            WriteProbabilityUpdate(writer, new_prob[i], old_prob[i]);
+        }
+    }
+}
+
+/// Remaps the (new, old) probability pair to a delta value and writes it with EncodeTermSubExp.
+void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    // The remapped value, not the raw difference, is what gets encoded.
+    EncodeTermSubExp(writer, RemapProbability(new_prob, old_prob));
+}
+
+/// Converts a changed probability into the value that is written to the bitstream.
+/// The new value is recentered around the old probability and then looked up in map_lut.
+s32 VP9::RemapProbability(s32 new_prob, s32 old_prob) {
+    constexpr s32 max_prob = 0xff;
+    // Work on zero-based values.
+    const s32 v = new_prob - 1;
+    const s32 m = old_prob - 1;
+    // Old probabilities in the upper half of the range are mirrored before recentering.
+    const s32 recentered = (m * 2 <= max_prob)
+                               ? RecenterNonNeg(v, m)
+                               : RecenterNonNeg(max_prob - 1 - v, max_prob - 1 - m);
+    // The table index is one less than the recentered value, clamped at 0.
+    const auto index = static_cast<std::size_t>(std::max(0, recentered - 1));
+    return map_lut[index];
+}
+
+/// Folds new_prob around old_prob: even results at or above it, odd results below it.
+s32 VP9::RecenterNonNeg(s32 new_prob, s32 old_prob) {
+    if (new_prob > old_prob * 2) {
+        return new_prob; // more than twice the old value: passed through unchanged
+    }
+    const s32 distance = new_prob - old_prob;
+    // Non-negative distances map to even values, negative ones to odd values.
+    return distance >= 0 ? distance * 2 : -distance * 2 - 1;
+}
+
+/// Writes value in tiers: up to three escape bits pick a 4-, 4- or 5-bit tail, else 7 or 8 bits.
+void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) {
+    if (WriteLessThan(writer, value, 16)) {
+        writer.Write(value, 4);
+        return;
+    }
+    if (WriteLessThan(writer, value, 32)) {
+        writer.Write(value - 16, 4);
+        return;
+    }
+    if (WriteLessThan(writer, value, 64)) {
+        writer.Write(value - 32, 5);
+        return;
+    }
+    constexpr s32 bits = 8;
+    constexpr s32 cutoff = (1 << bits) - 191;
+    const s32 rest = value - 64;
+    if (rest < cutoff) {
+        writer.Write(rest, bits - 1);
+        return;
+    }
+    writer.Write((rest - cutoff) / 2 + cutoff, bits - 1);
+    writer.Write((rest - cutoff) & 1, 1);
+}
+
+/// Writes one bit that is set when value >= test; returns true when value < test.
+bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
+    writer.Write(value >= test);
+    return value < test;
+}
+
+/// Writes the coefficient probability updates, one 576-byte block at a time.
+///
+/// Each block starts with a single "block changed" bit; only changed blocks are followed by
+/// per-probability updates. Blocks are emitted in order until the one whose index equals
+/// tx_mode has been written (all four when tx_mode matches none of them).
+///
+/// @param writer   Range encoder receiving the bits.
+/// @param tx_mode  Index of the last block to emit.
+/// @param new_prob Probabilities of the current frame.
+/// @param old_prob Probabilities the updates are coded against.
+void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                     const std::array<u8, 2304>& new_prob,
+                                     const std::array<u8, 2304>& old_prob) {
+    // Probabilities are stored as triplets padded to 4 bytes. The padding byte is neither
+    // compared nor written.
+    constexpr s32 entry_bytes = 4;
+    constexpr s32 block_entries = 2 * 2 * 6 * 6;
+    constexpr s32 block_bytes = block_entries * entry_bytes;
+
+    const auto triplet_differs = [&](s32 offset) -> bool {
+        return new_prob[offset + 0] != old_prob[offset + 0] ||
+               new_prob[offset + 1] != old_prob[offset + 1] ||
+               new_prob[offset + 2] != old_prob[offset + 2];
+    };
+
+    // Four blocks of block_bytes each cover the whole 2304-byte array.
+    for (s32 block_index = 0; block_index < 4; ++block_index) {
+        const s32 base_index = block_index * block_bytes;
+
+        // A block is flagged as soon as any of its triplets changed.
+        bool update = false;
+        for (s32 entry = 0; entry < block_entries && !update; ++entry) {
+            update = triplet_differs(base_index + entry * entry_bytes);
+        }
+        writer.Write(update);
+
+        // Unchanged blocks carry nothing beyond the flag bit.
+        for (s32 entry = 0; update && entry < block_entries; ++entry) {
+            // Entries are laid out as [2][2][6][6]; recover the two innermost indices.
+            const s32 k = (entry / 6) % 6;
+            const s32 l = entry % 6;
+            if (k == 0 && l >= 3) {
+                // For k == 0 only the first three entries are transmitted.
+                continue;
+            }
+
+            const s32 offset = base_index + entry * entry_bytes;
+            for (s32 n = 0; n < 3; ++n) {
+                WriteProbabilityUpdate(writer, new_prob[offset + n], old_prob[offset + n]);
+            }
+        }
+
+        // Stop once the block selected by tx_mode has been emitted.
+        if (block_index == tx_mode) {
+            break;
+        }
+    }
+}
+
+/// Motion-vector variant: a changed probability is sent as new_prob >> 1 in 7 bits, not a delta.
+void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    const bool has_changed = old_prob != new_prob;
+    writer.Write(has_changed, diff_update_probability);
+    if (!has_changed) {
+        return;
+    }
+    writer.Write(new_prob >> 1, 7);
+}
+
+/// Smallest log2 value for which (64 << log2) is not below the frame width in 64-pixel columns.
+s32 VP9::CalcMinLog2TileCols(s32 frame_width) {
+    constexpr s32 max_tile_width_b64 = 64;
+    const s32 sb64_cols = (frame_width + 63) / 64; // width in 64-pixel units, rounded up
+    s32 log2_cols = 0;
+    while ((max_tile_width_b64 << log2_cols) < sb64_cols) {
+        ++log2_cols;
+    }
+    return log2_cols;
+}
+
+/// Largest log2 value whose shift still leaves at least 4 64-pixel columns (0 if none does).
+s32 VP9::CalcMaxLog2TileCols(s32 frame_width) {
+    constexpr s32 min_tile_width_b64 = 4;
+    const s32 sb64_cols = (frame_width + 63) / 64; // width in 64-pixel units, rounded up
+    s32 log2_cols = 1;
+    while ((sb64_cols >> log2_cols) >= min_tile_width_b64) {
+        ++log2_cols;
+    }
+    return log2_cols - 1; // the loop stops one past the last value that still fits
+}
+
+/// Reads PictureInfo and the entropy probabilities from GPU memory into a Vp9PictureInfo.
+Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
+    PictureInfo picture_info{};
+    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
+    Vp9PictureInfo vp9_info = picture_info.Convert();
+
+    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
+
+    // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
+    // order: last, golden, altref, current. It may be worthwhile to track the updates done here
+    // to avoid buffering frame data needed for reference frame updating in the header composition.
+    std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64));
+    return vp9_info; // returned by name: std::move here would only inhibit copy elision
+}
+
+void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
+    EntropyProbs raw_probs{}; // filled from GPU memory by the ReadBlock call below
+    gpu.MemoryManager().ReadBlock(offset, &raw_probs, sizeof(raw_probs));
+    raw_probs.Convert(dst); // presumably writes the converted probabilities into dst
+}
+
+/// Reads the submitted frame from GPU memory and returns the frame that is due for decoding.
+///
+/// Frames pass through a two-entry delay line (next_next_frame, then next_frame) so that each
+/// returned frame's show_frame flag can be taken from the last_frame_shown field of the frame
+/// submitted right after it. While the delay line is filling, the frame just read is returned.
+Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) {
+    gpu.SyncGuestHost();
+
+    Vp9FrameContainer frame{};
+    frame.info = GetVp9PictureInfo(state);
+    frame.bit_stream.resize(frame.info.bitstream_size);
+    gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(),
+                                  frame.info.bitstream_size);
+
+    if (next_next_frame.bit_stream.empty()) {
+        // Nothing buffered yet: remember this frame and hand it back unchanged.
+        next_next_frame.info = frame.info;
+        next_next_frame.bit_stream = frame.bit_stream;
+        return frame;
+    }
+
+    // Trade places with the newest buffered frame. Swapping exchanges the vectors' storage
+    // instead of copying the bitstream twice.
+    next_next_frame.info.show_frame = frame.info.last_frame_shown;
+    std::swap(frame.info, next_next_frame.info);
+    frame.bit_stream.swap(next_next_frame.bit_stream);
+
+    if (next_frame.bit_stream.empty()) {
+        // Second stage still empty: keep a copy there and return the same frame.
+        next_frame.info = frame.info;
+        next_frame.bit_stream = frame.bit_stream;
+        return frame;
+    }
+
+    // Steady state: trade places again, this time with the oldest buffered frame.
+    next_frame.info.show_frame = frame.info.last_frame_shown;
+    std::swap(frame.info, next_frame.info);
+    frame.bit_stream.swap(next_frame.bit_stream);
+    return frame;
+}
+
+/// Composes the VP9 compressed header: the transform mode followed by probability updates,
+/// each coded relative to prev_frame_probs. On shown non-key frames prev_frame_probs is then
+/// overwritten with the probabilities just written, so the next header is coded against them.
+/// Returns the range encoder's buffer after End() has been called on it.
+std::vector<u8> VP9::ComposeCompressedHeader() {
+    VpxRangeEncoder writer{};
+    // prev_frame_probs only tracks frames that are shown and are not key frames.
+    const bool save_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
+
+    if (!current_frame_info.lossless) {
+        if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
+            writer.Write(3, 2);
+            writer.Write(current_frame_info.transform_mode == 4);
+        } else {
+            writer.Write(current_frame_info.transform_mode, 2);
+        }
+    }
+
+    if (current_frame_info.transform_mode == 4) {
+        // tx_mode_probs() in the spec
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob,
+                               prev_frame_probs.tx_8x8_prob);
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob,
+                               prev_frame_probs.tx_16x16_prob);
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob,
+                               prev_frame_probs.tx_32x32_prob);
+        if (save_probs) {
+            prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob;
+            prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob;
+            prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob;
+        }
+    }
+    // read_coef_probs()  in the spec
+    WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode,
+                               current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs);
+    // read_skip_probs()  in the spec
+    WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs,
+                           prev_frame_probs.skip_probs);
+
+    if (save_probs) {
+        prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs;
+        prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs;
+    }
+
+    if (!current_frame_info.intra_only) {
+        // read_inter_probs() in the spec
+        WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob,
+                                       prev_frame_probs.inter_mode_prob);
+        if (save_probs) {
+            prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob;
+        }
+
+        if (current_frame_info.interp_filter == 4) {
+            // read_interp_filter_probs() in the spec
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob,
+                                   prev_frame_probs.switchable_interp_prob);
+            if (save_probs) {
+                prev_frame_probs.switchable_interp_prob =
+                    current_frame_info.entropy.switchable_interp_prob;
+            }
+        }
+
+        // read_is_inter_probs() in the spec
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob,
+                               prev_frame_probs.intra_inter_prob);
+        if (save_probs) {
+            prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob;
+        }
+        // frame_reference_mode() in the spec
+        if ((current_frame_info.ref_frame_sign_bias[1] & 1) !=
+                (current_frame_info.ref_frame_sign_bias[2] & 1) ||
+            (current_frame_info.ref_frame_sign_bias[1] & 1) !=
+                (current_frame_info.ref_frame_sign_bias[3] & 1)) {
+            if (current_frame_info.reference_mode >= 1) {
+                writer.Write(1, 1);
+                writer.Write(current_frame_info.reference_mode == 2);
+            } else {
+                writer.Write(0, 1);
+            }
+        }
+
+        // frame_reference_mode_probs() in the spec
+        if (current_frame_info.reference_mode == 2) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob,
+                                   prev_frame_probs.comp_inter_prob);
+            if (save_probs) {
+                prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob;
+            }
+        }
+
+        if (current_frame_info.reference_mode != 1) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob,
+                                   prev_frame_probs.single_ref_prob);
+            if (save_probs) {
+                prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob;
+            }
+        }
+
+        if (current_frame_info.reference_mode != 0) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob,
+                                   prev_frame_probs.comp_ref_prob);
+            if (save_probs) {
+                prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob;
+            }
+        }
+
+        // read_y_mode_probs
+        for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size();
+             ++index) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index],
+                                   prev_frame_probs.y_mode_prob[index]);
+        }
+        if (save_probs) {
+            prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob;
+        }
+        // read_partition_probs
+        WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob,
+                                       prev_frame_probs.partition_prob);
+        if (save_probs) {
+            prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob;
+        }
+
+        // mv_probs
+        for (s32 i = 0; i < 3; i++) {
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i],
+                                     prev_frame_probs.joints[i]);
+        }
+        if (save_probs) {
+            prev_frame_probs.joints = current_frame_info.entropy.joints;
+        }
+
+        for (s32 i = 0; i < 2; i++) {
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i],
+                                     prev_frame_probs.sign[i]);
+
+            for (s32 j = 0; j < 10; j++) {
+                const int index = i * 10 + j;
+
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index],
+                                         prev_frame_probs.classes[index]);
+            }
+
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i],
+                                     prev_frame_probs.class_0[i]);
+
+            for (s32 j = 0; j < 10; j++) {
+                const int index = i * 10 + j;
+
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
+                                         prev_frame_probs.prob_bits[index]);
+            }
+        }
+
+        for (s32 i = 0; i < 2; i++) {
+            for (s32 j = 0; j < 2; j++) {
+                for (s32 k = 0; k < 3; k++) {
+                    const int index = i * 2 * 3 + j * 3 + k;
+
+                    WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
+                                             prev_frame_probs.class_0_fr[index]);
+                }
+            }
+
+            for (s32 j = 0; j < 3; j++) {
+                const int index = i * 3 + j;
+
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
+                                         prev_frame_probs.fr[index]);
+            }
+        }
+
+        if (current_frame_info.allow_high_precision_mv) {
+            for (s32 index = 0; index < 2; index++) {
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
+                                         prev_frame_probs.class_0_hp[index]);
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
+                                         prev_frame_probs.high_precision[index]);
+            }
+        }
+
+        // save previous probs
+        if (save_probs) {
+            prev_frame_probs.sign = current_frame_info.entropy.sign;
+            prev_frame_probs.classes = current_frame_info.entropy.classes;
+            prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
+            prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
+            prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
+            prev_frame_probs.fr = current_frame_info.entropy.fr;
+            prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
+            prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
+        }
+    }
+
+    writer.End();
+    return writer.GetBuffer();
+}
+
+VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
+    VpxBitStreamWriter uncomp_writer{};
+    // Fields are written in bitstream order; prev_frame_probs and frame_ctxs are updated too.
+    uncomp_writer.WriteU(2, 2);                                      // Frame marker.
+    uncomp_writer.WriteU(0, 2);                                      // Profile.
+    uncomp_writer.WriteBit(false);                                   // Show existing frame.
+    uncomp_writer.WriteBit(!current_frame_info.is_key_frame);        // Frame type: 0 = key frame.
+    uncomp_writer.WriteBit(current_frame_info.show_frame);           // Show frame.
+    uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // Error resilient mode.
+
+    if (current_frame_info.is_key_frame) {
+        uncomp_writer.WriteU(frame_sync_code, 24);
+        uncomp_writer.WriteU(0, 3); // Color space.
+        uncomp_writer.WriteU(0, 1); // Color range.
+        uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+        uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+        uncomp_writer.WriteBit(false); // Render and frame size different.
+
+        // Reset context
+        prev_frame_probs = default_probs;
+        swap_next_golden = false;
+        loop_filter_ref_deltas.fill(0);
+        loop_filter_mode_deltas.fill(0);
+
+        // Allow frame offsets to stabilize before checking for golden frames
+        grace_period = 4;
+
+        // On key frames, all frame slots are set to the current frame,
+        // so the value of the selected slot doesn't really matter.
+        frame_ctxs.fill({current_frame_number, false, default_probs});
+
+        // intra only, meaning the frame can be recreated with no other references
+        current_frame_info.intra_only = true;
+
+    } else {
+        std::array<s32, 3> ref_frame_index;
+
+        if (!current_frame_info.show_frame) {
+            uncomp_writer.WriteBit(current_frame_info.intra_only);
+            if (!current_frame_info.last_frame_was_key) {
+                swap_next_golden = !swap_next_golden;
+            }
+        } else {
+            current_frame_info.intra_only = false;
+        }
+        if (!current_frame_info.error_resilient_mode) {
+            uncomp_writer.WriteU(0, 2); // Reset frame context.
+        }
+
+        // Last, Golden, Altref frames
+        ref_frame_index = std::array<s32, 3>{0, 1, 2};
+
+        // set when next frame is hidden
+        // altref and golden references are swapped
+        if (swap_next_golden) {
+            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        }
+
+        // By default only bit 0 is set, i.e. only the Last frame slot is refreshed
+        u64 refresh_frame_flags = 1;
+
+        // golden frame may refresh, determined if the next golden frame offset is changed
+        bool golden_refresh = false;
+        if (grace_period <= 0) {
+            for (s32 index = 1; index < 3; ++index) {
+                if (current_frame_info.frame_offsets[index] !=
+                    next_frame.info.frame_offsets[index]) {
+                    current_frame_info.refresh_frame[index] = true;
+                    golden_refresh = true;
+                    grace_period = 3;
+                }
+            }
+        }
+
+        if (current_frame_info.show_frame &&
+            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
+            // Update golden frame
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        }
+
+        if (!current_frame_info.show_frame) {
+            // Update altref
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        } else if (golden_refresh) {
+            refresh_frame_flags = 3; // bits 0 and 1; overrides the value chosen above
+        }
+
+        if (current_frame_info.intra_only) {
+            uncomp_writer.WriteU(frame_sync_code, 24);
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+        } else {
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+
+            for (s32 index = 1; index < 4; index++) {
+                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
+                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
+            }
+
+            uncomp_writer.WriteBit(true);  // Frame size with refs.
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
+            uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
+
+            if (current_frame_info.interp_filter != 4) {
+                uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
+            }
+        }
+    }
+
+    if (!current_frame_info.error_resilient_mode) {
+        uncomp_writer.WriteBit(true); // Refresh frame context (TODO: hard-coded, source unknown).
+        uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
+    }
+
+    int frame_ctx_idx = 0; // shown frames use context 0, hidden frames use context 1
+    if (!current_frame_info.show_frame) {
+        frame_ctx_idx = 1;
+    }
+
+    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
+    prev_frame_probs =
+        frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
+    frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
+
+    uncomp_writer.WriteU(current_frame_info.first_level, 6);
+    uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
+    uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled);
+
+    if (current_frame_info.mode_ref_delta_enabled) {
+        // check if ref deltas are different, update accordingly
+        std::array<bool, 4> update_loop_filter_ref_deltas;
+        std::array<bool, 2> update_loop_filter_mode_deltas;
+
+        bool loop_filter_delta_update = false;
+
+        for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
+            const s8 old_deltas = loop_filter_ref_deltas[index];
+            const s8 new_deltas = current_frame_info.ref_deltas[index];
+
+            loop_filter_delta_update |=
+                (update_loop_filter_ref_deltas[index] = old_deltas != new_deltas);
+        }
+
+        for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
+            const s8 old_deltas = loop_filter_mode_deltas[index];
+            const s8 new_deltas = current_frame_info.mode_deltas[index];
+
+            loop_filter_delta_update |=
+                (update_loop_filter_mode_deltas[index] = old_deltas != new_deltas);
+        }
+
+        uncomp_writer.WriteBit(loop_filter_delta_update);
+
+        if (loop_filter_delta_update) {
+            for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
+                uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]);
+
+                if (update_loop_filter_ref_deltas[index]) {
+                    uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6);
+                }
+            }
+
+            for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
+                uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]);
+
+                if (update_loop_filter_mode_deltas[index]) {
+                    uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6);
+                }
+            }
+            // save new deltas
+            loop_filter_ref_deltas = current_frame_info.ref_deltas;
+            loop_filter_mode_deltas = current_frame_info.mode_deltas;
+        }
+    }
+
+    uncomp_writer.WriteU(current_frame_info.base_q_index, 8);
+
+    uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q);
+    uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
+    uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
+
+    uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
+
+    const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
+    const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
+
+    const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2;
+    const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1;
+
+    // If it's less than the maximum, we need to add an extra 0 on the bitstream
+    // to indicate that it should stop reading.
+    if (current_frame_info.log2_tile_cols < max_tile_cols_log2) {
+        uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1);
+    } else {
+        uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff);
+    }
+
+    const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0;
+
+    uncomp_writer.WriteBit(tile_rows_log2_is_nonzero);
+
+    if (tile_rows_log2_is_nonzero) {
+        uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1);
+    }
+
+    return uncomp_writer;
+}
+
+// Assembles the uncompressed header, compressed header and frame payload into the `frame`
+// member and returns a reference to it; the contents are replaced by the next call.
+std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) {
+    std::vector<u8> bitstream;
+    {
+        Vp9FrameContainer curr_frame = GetCurrentFrame(state);
+        current_frame_info = curr_frame.info;
+        bitstream.swap(curr_frame.bit_stream); // take the payload instead of copying it
+    }
+
+    // The uncompressed header routine sets PrevProb parameters needed for the compressed header
+    auto uncomp_writer = ComposeUncompressedHeader();
+    std::vector<u8> compressed_header = ComposeCompressedHeader();
+    uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16);
+    uncomp_writer.Flush();
+    const std::vector<u8>& uncompressed_header = uncomp_writer.GetByteArray(); // no copy
+
+    // Write headers and frame to buffer
+    frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
+    std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size());
+    std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(),
+                compressed_header.size());
+    std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(),
+                bitstream.data(), bitstream.size());
+
+    // keep track of frame number
+    current_frame_number++;
+    grace_period--;
+    // don't display hidden frames
+    hidden = !current_frame_info.show_frame;
+    return frame;
+}
+
+VpxRangeEncoder::VpxRangeEncoder() {
+    Write(false); // every stream starts with one zero bit coded at half probability
+}
+
+VpxRangeEncoder::~VpxRangeEncoder() = default;
+
+void VpxRangeEncoder::Write(s32 value, s32 value_size) {
+    for (s32 shift = value_size; shift-- > 0;) { // highest of the low value_size bits first
+        Write(((value >> shift) & 1) != 0);
+    }
+}
+
+void VpxRangeEncoder::Write(bool bit) {
+    Write(bit, half_probability); // 128 out of 256: both outcomes weighted equally
+}
+
+void VpxRangeEncoder::Write(bool bit, s32 probability) {
+    u32 local_range = range; // split below divides the range in proportion to probability/256
+    const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8);
+    local_range = split; // a false bit keeps the lower part of the range
+
+    if (bit) {
+        low_value += split; // a true bit moves past the lower part ...
+        local_range = range - split; // ... and keeps what is left of the range
+    }
+
+    s32 shift = norm_lut[local_range]; // left shift that renormalizes the new range
+    local_range <<= shift;
+    count += shift;
+
+    if (count >= 0) { // enough bits accumulated to emit a byte
+        const s32 offset = shift - count;
+
+        if (((low_value << (offset - 1)) >> 31) != 0) { // carry out of the top bit
+            const s32 current_pos = static_cast<s32>(base_stream.GetPosition());
+            base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
+            while (base_stream.GetPosition() >= 0 && PeekByte() == 0xff) { // ripple carry
+                base_stream.WriteByte(0);
+
+                base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos);
+            }
+            base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); // absorb the carry
+            base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); // back to write pos
+        }
+        base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset))));
+
+        low_value <<= offset;
+        shift = count;
+        low_value &= 0xffffff; // drop the byte that was just emitted
+        count -= 8;
+    }
+
+    low_value <<= shift;
+    range = local_range;
+}
+
+void VpxRangeEncoder::End() {
+    for (s32 padding_bits = 32; padding_bits > 0; --padding_bits) {
+        Write(false); // 32 trailing zero bits close the stream
+    }
+}
+
+u8 VpxRangeEncoder::PeekByte() {
+    const u8 value = base_stream.ReadByte(); // presumably advances the stream position by one
+    base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); // step back after the read
+
+    return value;
+}
+
+VpxBitStreamWriter::VpxBitStreamWriter() = default; // members use in-class initializers
+
+VpxBitStreamWriter::~VpxBitStreamWriter() = default;
+
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
+    WriteBits(value, value_size); // plain unsigned field: the low value_size bits of value
+}
+
+void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
+    // Sign-magnitude layout: value_size magnitude bits followed by one sign bit (1 = negative).
+    const bool is_negative = value < 0;
+    const s32 magnitude = is_negative ? -value : value;
+    const u32 sign_bit = is_negative ? 1U : 0U;
+
+    WriteBits(static_cast<u32>(magnitude << 1) | sign_bit, value_size + 1);
+}
+
+void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
+    const bool delta_coded = value != 0; // a zero delta is signalled by the flag bit alone
+    WriteBit(delta_coded);
+
+    if (delta_coded) {
+        WriteBits(value, 4); // NOTE(review): spec delta_q looks like 4 bits + sign - confirm
+    }
+}
+
+void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
+    s32 value_pos = 0; // number of bits of value already written
+    s32 remaining = bit_count;
+
+    while (remaining > 0) {
+        s32 copy_size = remaining;
+
+        const s32 free = GetFreeBufferBits(); // flushes first if the buffer is full
+
+        if (copy_size > free) {
+            copy_size = free; // only fill what fits; the rest goes into the next iteration
+        }
+
+        const s32 mask = (1 << copy_size) - 1;
+
+        const s32 src_shift = (bit_count - value_pos) - copy_size; // highest unwritten bits
+        const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; // buffer fills from MSB
+
+        buffer |= ((value >> src_shift) & mask) << dst_shift;
+
+        value_pos += copy_size;
+        buffer_pos += copy_size;
+        remaining -= copy_size;
+    }
+}
+
+void VpxBitStreamWriter::WriteBit(bool state) {
+    WriteBits(state ? 1 : 0, 1); // a bool is a one-bit field
+}
+
+s32 VpxBitStreamWriter::GetFreeBufferBits() {
+    if (buffer_pos == buffer_size) { // buffer is full: emit it so the caller gets fresh space
+        Flush();
+    }
+
+    return buffer_size - buffer_pos; // number of unused bits left in the buffer
+}
+
+void VpxBitStreamWriter::Flush() {
+    if (buffer_pos == 0) { // nothing buffered, so no byte is appended
+        return;
+    }
+    byte_array.push_back(static_cast<u8>(buffer)); // a partial byte keeps its low bits zero
+    buffer = 0;
+    buffer_pos = 0;
+}
+
+std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
+    return byte_array; // only bytes already pushed by Flush(); buffered bits are not included
+}
+
+const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
+    return byte_array; // read-only view of the same storage
+}
+
+} // namespace Tegra::Decoder
diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h
new file mode 100644
index 000000000..748e11bae
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -0,0 +1,216 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/stream.h"
+#include "video_core/command_classes/codecs/vp9_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+enum class FrameType { KeyFrame = 0, InterFrame = 1 }; // presumably the VP9 frame_type bit
+namespace Decoder {
+
+/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
+/// VP9 header bitstreams.
+
+class VpxRangeEncoder {
+public:
+    VpxRangeEncoder();
+    ~VpxRangeEncoder();
+
+    /// Writes the rightmost value_size bits from value into the stream, highest bit first
+    void Write(s32 value, s32 value_size);
+
+    /// Writes a single bit with half probability
+    void Write(bool bit);
+
+    /// Writes a bit to the base_stream encoded with probability (weight of false, out of 256)
+    void Write(bool bit, s32 probability);
+
+    /// Signal the end of the bitstream by writing 32 zero bits
+    void End();
+
+    std::vector<u8>& GetBuffer() {
+        return base_stream.GetBuffer();
+    }
+
+    const std::vector<u8>& GetBuffer() const {
+        return base_stream.GetBuffer();
+    }
+
+private:
+    u8 PeekByte();
+    Common::Stream base_stream{}; // destination of the encoded bytes
+    u32 low_value{};              // accumulated low bound; output bytes are shifted out of it
+    u32 range{0xff};              // current range; always a valid norm_lut index
+    s32 count{-24};               // bit counter; a byte is emitted once it reaches zero
+    s32 half_probability{128};    // probability used by Write(bool)
+    static constexpr std::array<s32, 256> norm_lut{ // shift that renormalizes a given range
+        0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+        3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+};
+
+class VpxBitStreamWriter {
+public:
+    VpxBitStreamWriter();
+    ~VpxBitStreamWriter();
+
+    /// Write the low value_size bits of an unsigned integer value
+    void WriteU(u32 value, u32 value_size);
+
+    /// Write a signed integer as value_size magnitude bits followed by one sign bit
+    void WriteS(s32 value, u32 value_size);
+
+    /// Based on 6.2.10 of VP9 Spec, writes a delta coded value
+    void WriteDeltaQ(u32 value);
+
+    /// Write a single bit.
+    void WriteBit(bool state);
+
+    /// Pushes the current buffer into byte_array and resets the buffer; no-op when it is empty
+    void Flush();
+
+    /// Returns byte_array
+    std::vector<u8>& GetByteArray();
+
+    /// Returns const byte_array
+    const std::vector<u8>& GetByteArray() const;
+
+private:
+    /// Write bit_count bits from value into buffer
+    void WriteBits(u32 value, u32 bit_count);
+
+    /// Returns the number of free bits left in buffer, invoking Flush() first if it is full
+    s32 GetFreeBufferBits();
+
+    s32 buffer_size{8}; // capacity of buffer in bits (one byte)
+
+    s32 buffer{};       // bits waiting to be flushed, filled from the most significant bit
+    s32 buffer_pos{};   // number of bits currently held in buffer
+    std::vector<u8> byte_array;
+};
+
+class VP9 { // Composes complete VP9 frames (headers + payload) from NVDEC register state
+public:
+    explicit VP9(GPU& gpu);
+    ~VP9();
+
+    /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
+    /// documentation
+    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);
+
+    /// Returns true if the most recent frame was a hidden frame.
+    bool WasFrameHidden() const {
+        return hidden;
+    }
+
+private:
+    /// Generates compressed header probability updates in the bitstream writer
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                const std::array<T, N>& old_prob);
+
+    /// Generates compressed header probability updates in the bitstream writer
+    /// If probs are not equal, WriteProbabilityDelta is invoked
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Generates compressed header probability deltas in the bitstream writer
+    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
+    s32 RemapProbability(s32 new_prob, s32 old_prob);
+
+    /// Recenters probability. Based on section 6.3.6 of VP9 Specification
+    s32 RecenterNonNeg(s32 new_prob, s32 old_prob);
+
+    /// Inverse of 6.3.4 Decode term subexp
+    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
+
+    /// Writes if the value is less than the test value
+    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
+
+    /// Writes probability updates for the Coef probabilities
+    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                    const std::array<u8, 2304>& new_prob,
+                                    const std::array<u8, 2304>& old_prob);
+
+    /// Write probabilities for 4-byte aligned structures
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                        const std::array<T, N>& old_prob);
+
+    /// Write motion vector probability updates. 6.3.17 in the spec
+    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// 6.2.14 Tile size calculation
+    s32 CalcMinLog2TileCols(s32 frame_width);
+    s32 CalcMaxLog2TileCols(s32 frame_width);
+
+    /// Returns VP9 information from NVDEC provided offset and size
+    Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
+
+    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
+    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
+
+    /// Returns frame to be decoded after buffering
+    Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
+
+    /// Use NVDEC provided information to compose the headers for the current frame
+    std::vector<u8> ComposeCompressedHeader();
+    VpxBitStreamWriter ComposeUncompressedHeader();
+
+    GPU& gpu;
+    std::vector<u8> frame;
+
+    std::array<s8, 4> loop_filter_ref_deltas{};
+    std::array<s8, 2> loop_filter_mode_deltas{};
+
+    bool hidden{}; // value-initialized so WasFrameHidden() is defined before the first frame
+    s64 current_frame_number = -2; // since we buffer 2 frames
+    s32 grace_period = 6;          // frame offsets need to stabilize
+    std::array<FrameContexts, 4> frame_ctxs{};
+    Vp9FrameContainer next_frame{};
+    Vp9FrameContainer next_next_frame{};
+    bool swap_next_golden{};
+
+    Vp9PictureInfo current_frame_info{};
+    Vp9EntropyProbs prev_frame_probs{};
+
+    s32 diff_update_probability = 252;
+    s32 frame_sync_code = 0x498342;
+    static constexpr std::array<s32, 254> map_lut = {
+        20,  21,  22,  23,  24,  25,  0,   26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+        36,  37,  1,   38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  2,   50,
+        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  3,   62,  63,  64,  65,  66,
+        67,  68,  69,  70,  71,  72,  73,  4,   74,  75,  76,  77,  78,  79,  80,  81,  82,
+        83,  84,  85,  5,   86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  6,
+        98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 7,   110, 111, 112, 113,
+        114, 115, 116, 117, 118, 119, 120, 121, 8,   122, 123, 124, 125, 126, 127, 128, 129,
+        130, 131, 132, 133, 9,   134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
+        10,  146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,  158, 159, 160,
+        161, 162, 163, 164, 165, 166, 167, 168, 169, 12,  170, 171, 172, 173, 174, 175, 176,
+        177, 178, 179, 180, 181, 13,  182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
+        193, 14,  194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15,  206, 207,
+        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 16,  218, 219, 220, 221, 222, 223,
+        224, 225, 226, 227, 228, 229, 17,  230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+        240, 241, 18,  242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+    };
+};
+
+} // namespace Decoder
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h
new file mode 100644
index 000000000..8688fdac0
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -0,0 +1,369 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <list>
+#include <vector>
+#include "common/cityhash.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+
+namespace Decoder {
+struct Vp9FrameDimensions { // four s16 fields; the 8-byte layout is asserted below
+    s16 width{};
+    s16 height{};
+    s16 luma_pitch{};
+    s16 chroma_pitch{};
+};
+static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");
+
+enum FrameFlags : u32 { // single-bit masks tested against PictureInfo::vp9_flags
+    IsKeyFrame = 1 << 0,
+    LastFrameIsKeyFrame = 1 << 1,
+    FrameSizeChanged = 1 << 2,
+    ErrorResilientMode = 1 << 3,
+    LastShowFrame = 1 << 4,
+    IntraOnly = 1 << 5,
+};
+
+enum class MvJointType { // which components of a motion vector are nonzero
+    MvJointZero = 0,   /* Zero vector */
+    MvJointHnzvz = 1,  /* Vert zero, hor nonzero */
+    MvJointHzvnz = 2,  /* Hor zero, vert nonzero */
+    MvJointHnzvnz = 3, /* Both components nonzero */
+};
+enum class MvClassType { // motion vector magnitude ranges; each class doubles the previous one
+    MvClass0 = 0,   /* (0, 2]     integer pel */
+    MvClass1 = 1,   /* (2, 4]     integer pel */
+    MvClass2 = 2,   /* (4, 8]     integer pel */
+    MvClass3 = 3,   /* (8, 16]    integer pel */
+    MvClass4 = 4,   /* (16, 32]   integer pel */
+    MvClass5 = 5,   /* (32, 64]   integer pel */
+    MvClass6 = 6,   /* (64, 128]  integer pel */
+    MvClass7 = 7,   /* (128, 256] integer pel */
+    MvClass8 = 8,   /* (256, 512] integer pel */
+    MvClass9 = 9,   /* (512, 1024] integer pel */
+    MvClass10 = 10, /* (1024, 2048] integer pel */
+};
+
+enum class BlockSize { // BlockSizes is the number of real sizes and doubles as BlockInvalid
+    Block4x4 = 0,
+    Block4x8 = 1,
+    Block8x4 = 2,
+    Block8x8 = 3,
+    Block8x16 = 4,
+    Block16x8 = 5,
+    Block16x16 = 6,
+    Block16x32 = 7,
+    Block32x16 = 8,
+    Block32x32 = 9,
+    Block32x64 = 10,
+    Block64x32 = 11,
+    Block64x64 = 12,
+    BlockSizes = 13,
+    BlockInvalid = BlockSizes
+};
+
+enum class PredictionMode { // MbModeCount is the number of modes listed before it
+    DcPred = 0,   // Average of above and left pixels
+    VPred = 1,    // Vertical
+    HPred = 2,    // Horizontal
+    D45Pred = 3,  // Directional 45  deg = round(arctan(1 / 1) * 180 / pi)
+    D135Pred = 4, // Directional 135 deg = 180 - 45
+    D117Pred = 5, // Directional 117 deg = 180 - 63
+    D153Pred = 6, // Directional 153 deg = 180 - 27
+    D207Pred = 7, // Directional 207 deg = 180 + 27
+    D63Pred = 8,  // Directional 63  deg = round(arctan(2 / 1) * 180 / pi)
+    TmPred = 9,   // True-motion
+    NearestMv = 10,
+    NearMv = 11,
+    ZeroMv = 12,
+    NewMv = 13,
+    MbModeCount = 14
+};
+
+enum class TxSize { // TxSizes is the number of transform sizes
+    Tx4x4 = 0,   // 4x4 transform
+    Tx8x8 = 1,   // 8x8 transform
+    Tx16x16 = 2, // 16x16 transform
+    Tx32x32 = 3, // 32x32 transform
+    TxSizes = 4
+};
+
+enum class TxMode { // TxModes is the number of transform modes
+    Only4X4 = 0,      // Only 4x4 transform used
+    Allow8X8 = 1,     // Allow block transform size up to 8x8
+    Allow16X16 = 2,   // Allow block transform size up to 16x16
+    Allow32X32 = 3,   // Allow block transform size up to 32x32
+    TxModeSelect = 4, // Transform specified for each block
+    TxModes = 5
+};
+
+enum class reference_mode { // NOTE(review): snake_case name differs from the other enums here
+    SingleReference = 0,
+    CompoundReference = 1,
+    ReferenceModeSelect = 2,
+    ReferenceModes = 3 // number of reference modes
+};
+
+struct Segmentation { // embedded in PictureInfo; the 0x64-byte layout is asserted below
+    u8 enabled{};
+    u8 update_map{};
+    u8 temporal_update{};
+    u8 abs_delta{};
+    std::array<u32, 8> feature_mask{};
+    std::array<std::array<s16, 4>, 8> feature_data{};
+};
+static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
+
+struct LoopFilter { // embedded in PictureInfo; the 7-byte layout is asserted below
+    u8 mode_ref_delta_enabled{};
+    std::array<s8, 4> ref_deltas{};
+    std::array<s8, 2> mode_deltas{};
+};
+static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
+
+struct Vp9EntropyProbs { // flat u8 probability tables; total size is asserted below (0x9F4)
+    std::array<u8, 36> y_mode_prob{};
+    std::array<u8, 64> partition_prob{};
+    std::array<u8, 2304> coef_probs{};
+    std::array<u8, 8> switchable_interp_prob{};
+    std::array<u8, 28> inter_mode_prob{};
+    std::array<u8, 4> intra_inter_prob{};
+    std::array<u8, 5> comp_inter_prob{};
+    std::array<u8, 10> single_ref_prob{};
+    std::array<u8, 5> comp_ref_prob{};
+    std::array<u8, 6> tx_32x32_prob{};
+    std::array<u8, 4> tx_16x16_prob{};
+    std::array<u8, 2> tx_8x8_prob{};
+    std::array<u8, 3> skip_probs{};
+    std::array<u8, 3> joints{};
+    std::array<u8, 2> sign{};
+    std::array<u8, 20> classes{};
+    std::array<u8, 2> class_0{};
+    std::array<u8, 20> prob_bits{};
+    std::array<u8, 12> class_0_fr{};
+    std::array<u8, 6> fr{};
+    std::array<u8, 2> class_0_hp{};
+    std::array<u8, 2> high_precision{};
+};
+static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size");
+
+struct Vp9PictureInfo { // per-frame parameters; most fields are filled by PictureInfo::Convert()
+    bool is_key_frame{};
+    bool intra_only{};
+    bool last_frame_was_key{};
+    bool frame_size_changed{};
+    bool error_resilient_mode{};
+    bool last_frame_shown{};
+    bool show_frame{}; // not set by PictureInfo::Convert()
+    std::array<s8, 4> ref_frame_sign_bias{};
+    s32 base_q_index{};
+    s32 y_dc_delta_q{};
+    s32 uv_dc_delta_q{};
+    s32 uv_ac_delta_q{};
+    bool lossless{};
+    s32 transform_mode{};
+    bool allow_high_precision_mv{};
+    s32 interp_filter{};
+    s32 reference_mode{};
+    s8 comp_fixed_ref{};
+    std::array<s8, 2> comp_var_ref{};
+    s32 log2_tile_cols{};
+    s32 log2_tile_rows{};
+    bool segment_enabled{};
+    bool segment_map_update{};
+    bool segment_map_temporal_update{};
+    s32 segment_abs_delta{};
+    std::array<u32, 8> segment_feature_enable{};
+    std::array<std::array<s16, 4>, 8> segment_feature_data{};
+    bool mode_ref_delta_enabled{};
+    bool use_prev_in_find_mv_refs{};
+    std::array<s8, 4> ref_deltas{};
+    std::array<s8, 2> mode_deltas{};
+    Vp9EntropyProbs entropy{}; // not set by PictureInfo::Convert()
+    Vp9FrameDimensions frame_size{};
+    u8 first_level{};
+    u8 sharpness_level{};
+    u32 bitstream_size{};
+    std::array<u64, 4> frame_offsets{};  // not set by PictureInfo::Convert()
+    std::array<bool, 4> refresh_frame{}; // not set by PictureInfo::Convert()
+};
+
+struct Vp9FrameContainer { // one frame: its picture info plus its bitstream bytes
+    Vp9PictureInfo info{};
+    std::vector<u8> bit_stream;
+};
+
+struct PictureInfo { // fixed 0x100-byte layout (asserted below); Convert() unpacks it
+    INSERT_PADDING_WORDS(12);
+    u32 bitstream_size{};
+    INSERT_PADDING_WORDS(5);
+    Vp9FrameDimensions last_frame_size{};
+    Vp9FrameDimensions golden_frame_size{};
+    Vp9FrameDimensions alt_frame_size{};
+    Vp9FrameDimensions current_frame_size{};
+    u32 vp9_flags{};
+    std::array<s8, 4> ref_frame_sign_bias{};
+    u8 first_level{};
+    u8 sharpness_level{};
+    u8 base_q_index{};
+    u8 y_dc_delta_q{};
+    u8 uv_ac_delta_q{};
+    u8 uv_dc_delta_q{};
+    u8 lossless{};
+    u8 tx_mode{};
+    u8 allow_high_precision_mv{};
+    u8 interp_filter{};
+    u8 reference_mode{};
+    s8 comp_fixed_ref{};
+    std::array<s8, 2> comp_var_ref{};
+    u8 log2_tile_cols{};
+    u8 log2_tile_rows{};
+    Segmentation segmentation{};
+    LoopFilter loop_filter{};
+    INSERT_PADDING_BYTES(5);
+    u32 surface_params{};
+    INSERT_PADDING_WORDS(3);
+
+    Vp9PictureInfo Convert() const {
+        // Every flag is tested with a bit mask; fields not listed stay value-initialized.
+        return Vp9PictureInfo{
+            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
+            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
+            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
+            .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
+            .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
+            .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
+            .ref_frame_sign_bias = ref_frame_sign_bias,
+            .base_q_index = base_q_index,
+            .y_dc_delta_q = y_dc_delta_q,
+            .uv_dc_delta_q = uv_dc_delta_q,
+            .uv_ac_delta_q = uv_ac_delta_q,
+            .lossless = lossless != 0,
+            .transform_mode = tx_mode,
+            .allow_high_precision_mv = allow_high_precision_mv != 0,
+            .interp_filter = interp_filter,
+            .reference_mode = reference_mode,
+            .comp_fixed_ref = comp_fixed_ref,
+            .comp_var_ref = comp_var_ref,
+            .log2_tile_cols = log2_tile_cols,
+            .log2_tile_rows = log2_tile_rows,
+            .segment_enabled = segmentation.enabled != 0,
+            .segment_map_update = segmentation.update_map != 0,
+            .segment_map_temporal_update = segmentation.temporal_update != 0,
+            .segment_abs_delta = segmentation.abs_delta,
+            .segment_feature_enable = segmentation.feature_mask,
+            .segment_feature_data = segmentation.feature_data,
+            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
+            .use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
+                                        (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
+                                        (vp9_flags & FrameFlags::IntraOnly) == 0 &&
+                                        (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
+                                        (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
+            .ref_deltas = loop_filter.ref_deltas,
+            .mode_deltas = loop_filter.mode_deltas,
+            .frame_size = current_frame_size,
+            .first_level = first_level,
+            .sharpness_level = sharpness_level,
+            .bitstream_size = bitstream_size,
+        };
+    }
+};
+static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
+
+struct EntropyProbs {
+    INSERT_PADDING_BYTES(1024);
+    std::array<std::array<u8, 4>, 7> inter_mode_prob{};
+    std::array<u8, 4> intra_inter_prob{};
+    INSERT_PADDING_BYTES(80);
+    std::array<std::array<u8, 1>, 2> tx_8x8_prob{};
+    std::array<std::array<u8, 2>, 2> tx_16x16_prob{};
+    std::array<std::array<u8, 3>, 2> tx_32x32_prob{};
+    std::array<u8, 4> y_mode_prob_e8{};
+    std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
+    INSERT_PADDING_BYTES(64);
+    std::array<std::array<u8, 4>, 16> partition_prob{};
+    INSERT_PADDING_BYTES(10);
+    std::array<std::array<u8, 2>, 4> switchable_interp_prob{};
+    std::array<u8, 5> comp_inter_prob{};
+    std::array<u8, 4> skip_probs{};
+    std::array<u8, 3> joints{};
+    std::array<u8, 2> sign{};
+    std::array<std::array<u8, 1>, 2> class_0{};
+    std::array<std::array<u8, 3>, 2> fr{};
+    std::array<u8, 2> class_0_hp{};
+    std::array<u8, 2> high_precision{};
+    std::array<std::array<u8, 10>, 2> classes{};
+    std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{};
+    std::array<std::array<u8, 10>, 2> pred_bits{};
+    std::array<std::array<u8, 2>, 5> single_ref_prob{};
+    std::array<u8, 5> comp_ref_prob{};
+    INSERT_PADDING_BYTES(17);
+    std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4>
+        coef_probs{};
+
+    void Convert(Vp9EntropyProbs& fc) {
+        // Copy one table into its counterpart in fc; the number of bytes copied is whatever
+        // the destination container reports through size().
+        const auto copy_table = [](auto& dst, const auto& src) {
+            std::memcpy(dst.data(), src.data(), dst.size());
+        };
+
+        copy_table(fc.inter_mode_prob, inter_mode_prob);
+        copy_table(fc.intra_inter_prob, intra_inter_prob);
+        copy_table(fc.tx_8x8_prob, tx_8x8_prob);
+        copy_table(fc.tx_16x16_prob, tx_16x16_prob);
+        copy_table(fc.tx_32x32_prob, tx_32x32_prob);
+        // Each group of nine y-mode entries is eight values from y_mode_prob_e0e7 followed
+        // by one value from y_mode_prob_e8.
+        for (s32 group = 0; group < 4; group++) {
+            const s32 base = 9 * group;
+            for (s32 mode = 0; mode < 8; mode++) {
+                fc.y_mode_prob[base + mode] = y_mode_prob_e0e7[group][mode];
+            }
+            fc.y_mode_prob[base + 8] = y_mode_prob_e8[group];
+        }
+        copy_table(fc.partition_prob, partition_prob);
+        copy_table(fc.switchable_interp_prob, switchable_interp_prob);
+        copy_table(fc.comp_inter_prob, comp_inter_prob);
+        copy_table(fc.skip_probs, skip_probs);
+        copy_table(fc.joints, joints);
+        copy_table(fc.sign, sign);
+        copy_table(fc.class_0, class_0);
+        copy_table(fc.fr, fr);
+        copy_table(fc.class_0_hp, class_0_hp);
+        copy_table(fc.high_precision, high_precision);
+        copy_table(fc.classes, classes);
+        copy_table(fc.class_0_fr, class_0_fr);
+        copy_table(fc.prob_bits, pred_bits);
+        copy_table(fc.single_ref_prob, single_ref_prob);
+        copy_table(fc.comp_ref_prob, comp_ref_prob);
+        copy_table(fc.coef_probs, coef_probs);
+    }
+};
+static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
+
+enum class Ref { Last, Golden, AltRef }; // implicitly 0, 1, 2; presumably VP9 reference slots
+
+struct RefPoolElement { // value-initialised: frame 0, ref Ref::Last, refresh false
+    s64 frame{};
+    Ref ref{};
+    bool refresh{};
+};
+
+struct FrameContexts { // NOTE(review): meaning of `from`/`adapted` is not visible in this chunk
+    s64 from{};
+    bool adapted{};
+    Vp9EntropyProbs probs{}; // same type that EntropyProbs::Convert() fills in
+};
+
+}; // namespace Decoder
+}; // namespace Tegra
diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp
new file mode 100644
index 000000000..a5234ee47
--- /dev/null
+++ b/src/video_core/command_classes/host1x.cpp
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/command_classes/host1x.h"
+#include "video_core/gpu.h"
+
+Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} // only stores the GPU reference
+
+Tegra::Host1x::~Host1x() = default;
+
+void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) {
+    if (offset < sizeof(state) / sizeof(u32)) // skip word indices past the end of state
+        std::memcpy(reinterpret_cast<u32*>(&state) + offset, &arguments, sizeof(u32));
+}
+
+void Tegra::Host1x::ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments) {
+    // Only the first argument word is used: mirror it into the state, then dispatch.
+    const u32 argument = arguments[0];
+    StateWrite(static_cast<u32>(method), argument);
+    switch (method) {
+    case Method::WaitSyncpt:
+    case Method::WaitSyncpt32:
+        Execute(argument);
+        break;
+    case Method::LoadSyncptPayload32:
+        syncpoint_value = argument;
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
+        break;
+    }
+}
+
+void Tegra::Host1x::Execute(u32 data) {
+    // Meant to wait on a valid syncpoint; currently a no-op, so `data` is unused.
+    // TODO: Implement when proper Async is in place
+}
diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h
new file mode 100644
index 000000000..501a5ed2e
--- /dev/null
+++ b/src/video_core/command_classes/host1x.h
@@ -0,0 +1,78 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+class Host1x {
+public:
+    struct Host1xClassRegisters { // one u32 per register word; total size is asserted below
+        u32 incr_syncpt{};
+        u32 incr_syncpt_ctrl{};
+        u32 incr_syncpt_error{};
+        INSERT_PADDING_WORDS(5);
+        u32 wait_syncpt{}; // word 0x08
+        u32 wait_syncpt_base{};
+        u32 wait_syncpt_incr{};
+        u32 load_syncpt_base{};
+        u32 incr_syncpt_base{};
+        u32 clear{};
+        u32 wait{};
+        u32 wait_with_interrupt{};
+        u32 delay_use{};
+        u32 tick_count_high{};
+        u32 tick_count_low{};
+        u32 tick_ctrl{};
+        INSERT_PADDING_WORDS(23);
+        u32 ind_ctrl{};
+        u32 ind_off2{};
+        u32 ind_off{};
+        std::array<u32, 31> ind_data{};
+        INSERT_PADDING_WORDS(1);
+        u32 load_syncpoint_payload32{}; // word 0x4E
+        u32 stall_ctrl{};
+        u32 wait_syncpt32{}; // word 0x50
+        u32 wait_syncpt_base32{};
+        u32 load_syncpt_base32{};
+        u32 incr_syncpt_base32{};
+        u32 stall_count_high{};
+        u32 stall_count_low{};
+        u32 xref_ctrl{};
+        u32 channel_xref_high{};
+        u32 channel_xref_low{};
+    };
+    static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
+
+    enum class Method : u32 { // each value is a register's word index (byte offset / 4)
+        WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
+        LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
+        WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
+    };
+
+    explicit Host1x(GPU& gpu);
+    ~Host1x();
+
+    /// Writes the first argument into the state, then invokes Execute() for the wait methods
+    void ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments);
+
+private:
+    /// For Host1x, execute is waiting on a syncpoint previously written into the state
+    void Execute(u32 data);
+
+    /// Write argument into the state at the given word offset (offset * sizeof(u32) bytes)
+    void StateWrite(u32 offset, u32 arguments);
+
+    u32 syncpoint_value{}; // set by the LoadSyncptPayload32 method
+    Host1xClassRegisters state{};
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp
new file mode 100644
index 000000000..ede9466eb
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -0,0 +1,56 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <bitset>
+#include "common/assert.h"
+#include "common/bit_util.h"
+#include "core/memory.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+
+Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} // owns one Codec
+
+Nvdec::~Nvdec() = default;
+
+void Nvdec::ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments) {
+    // Record the first argument in the codec state. Every method except SetVideoCodec
+    // stores it widened to 64 bits and shifted left by 8.
+    const u32 state_index = static_cast<u32>(method);
+    if (method != Method::SetVideoCodec) {
+        codec->StateWrite(state_index, static_cast<u64>(arguments[0]) << 8);
+    } else {
+        codec->StateWrite(state_index, arguments[0]);
+    }
+
+    if (method == Method::SetVideoCodec) {
+        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
+    } else if (method == Method::Execute) {
+        Execute();
+    }
+}
+
+AVFrame* Nvdec::GetFrame() { // forwards to Codec::GetCurrentFrame()
+    return codec->GetCurrentFrame();
+}
+
+const AVFrame* Nvdec::GetFrame() const { // const overload, same forwarding
+    return codec->GetCurrentFrame();
+}
+
+void Nvdec::Execute() {
+    // Only H264 and Vp9 are forwarded to the decoder; anything else is reported.
+    const auto active_codec = codec->GetCurrentCodec();
+    const bool decodable = active_codec == NvdecCommon::VideoCodec::H264 ||
+                           active_codec == NvdecCommon::VideoCodec::Vp9;
+    if (decodable) {
+        codec->Decode();
+        return;
+    }
+    UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h
new file mode 100644
index 000000000..c1a9d843e
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/codecs/codec.h"
+
+namespace Tegra {
+class GPU;
+
+class Nvdec {
+public:
+    enum class Method : u32 { // the two methods ProcessMethod() acts on explicitly
+        SetVideoCodec = 0x80,
+        Execute = 0xc0,
+    };
+
+    explicit Nvdec(GPU& gpu);
+    ~Nvdec();
+
+    /// Writes the method into the state, Invoke Execute() if encountered
+    void ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments);
+
+    /// Return most recently decoded frame
+    AVFrame* GetFrame();
+    const AVFrame* GetFrame() const;
+
+private:
+    /// Invoke codec to decode a frame
+    void Execute();
+
+    GPU& gpu;
+    std::unique_ptr<Tegra::Codec> codec; // created in the constructor
+};
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h
new file mode 100644
index 000000000..01b5e086d
--- /dev/null
+++ b/src/video_core/command_classes/nvdec_common.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::NvdecCommon {
+
+struct NvdecRegisters { // every named register occupies a u64 slot; padding is in u32 words
+    INSERT_PADDING_WORDS(256);
+    u64 set_codec_id{}; // byte offset 0x400 (= 0x80 * sizeof(u64))
+    INSERT_PADDING_WORDS(254);
+    u64 set_platform_id{}; // byte offset 0x800
+    u64 picture_info_offset{};
+    u64 frame_bitstream_offset{};
+    u64 frame_number{};
+    u64 h264_slice_data_offsets{};
+    u64 h264_mv_dump_offset{};
+    INSERT_PADDING_WORDS(6);
+    u64 frame_stats_offset{};
+    u64 h264_last_surface_luma_offset{};
+    u64 h264_last_surface_chroma_offset{};
+    std::array<u64, 17> surface_luma_offset{};
+    std::array<u64, 17> surface_chroma_offset{};
+    INSERT_PADDING_WORDS(132);
+    u64 vp9_entropy_probs_offset{}; // byte offset 0xB80
+    u64 vp9_backward_updates_offset{};
+    u64 vp9_last_frame_segmap_offset{};
+    u64 vp9_curr_frame_segmap_offset{};
+    INSERT_PADDING_WORDS(2);
+    u64 vp9_last_frame_mvs_offset{};
+    u64 vp9_curr_frame_mvs_offset{};
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
+
+enum class VideoCodec : u32 { // Nvdec::ProcessMethod casts the raw SetVideoCodec argument to this
+    None = 0x0,
+    H264 = 0x3,
+    Vp8 = 0x5,
+    H265 = 0x7,
+    Vp9 = 0x9,
+};
+
+} // namespace Tegra::NvdecCommon
diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp
new file mode 100644
index 000000000..a0ab44855
--- /dev/null
+++ b/src/video_core/command_classes/sync_manager.cpp
@@ -0,0 +1,60 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include <algorithm>
+#include "sync_manager.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} // only stores the GPU reference
+SyncptIncrManager::~SyncptIncrManager() = default;
+
+void SyncptIncrManager::Increment(u32 id) { // id: syncpoint to bump as soon as it is at the front
+    increments.push_back(SyncptIncr{0, id, 0, true}); // (id, syncpt_id, class_id, done)
+    IncrementAllDone();
+}
+
+u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
+    const u32 handle = current_id++;
+    // SyncptIncr's constructor takes (id, syncpt_id, class_id), in that order.
+    increments.push_back(SyncptIncr{handle, id, class_id});
+    return handle;
+}
+
+void SyncptIncrManager::SignalDone(u32 handle) {
+    auto done_incr = std::find_if(increments.begin(), increments.end(),
+                                  [handle](const SyncptIncr& incr) { return incr.id == handle; });
+    if (done_incr != increments.end()) {
+        done_incr->complete = true; // flag in place so no field is rebuilt or swapped
+    }
+    IncrementAllDone();
+}
+
+void SyncptIncrManager::IncrementAllDone() {
+    // Walk the leading run of completed entries, bumping each one's syncpoint in order,
+    // then drop that run from the front of the queue.
+    std::size_t retired = 0;
+    while (retired < increments.size() && increments[retired].complete) {
+        gpu.IncrementSyncPoint(increments[retired].syncpt_id);
+        ++retired;
+    }
+    increments.erase(increments.begin(), increments.begin() + retired);
+}
+} // namespace Tegra
diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h
new file mode 100644
index 000000000..353b67573
--- /dev/null
+++ b/src/video_core/command_classes/sync_manager.h
@@ -0,0 +1,64 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+struct SyncptIncr {
+    u32 id;
+    u32 class_id;
+    u32 syncpt_id;
+    bool complete;
+    // NOTE: parameters are (id, syncpt_id_, class_id_, done) - not the member order above.
+    SyncptIncr(u32 id, u32 syncpt_id_, u32 class_id_, bool done = false)
+        : id(id), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
+};
+
+class SyncptIncrManager {
+public:
+    explicit SyncptIncrManager(GPU& gpu);
+    ~SyncptIncrManager();
+
+    /// Add syncpoint id and increment all
+    void Increment(u32 id);
+
+    /// Returns a handle to increment later
+    u32 IncrementWhenDone(u32 class_id, u32 id);
+
+    /// Marks the increment identified by handle as done, then calls IncrementAllDone()
+    void SignalDone(u32 handle);
+
+    /// Increment all sequential pending increments that are already done.
+    void IncrementAllDone();
+
+private:
+    std::vector<SyncptIncr> increments; // pending increments, oldest first
+    std::mutex increment_lock; // NOTE(review): never locked in the visible sync_manager.cpp
+    u32 current_id{}; // next handle returned by IncrementWhenDone()
+
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
new file mode 100644
index 000000000..66e15a1a8
--- /dev/null
+++ b/src/video_core/command_classes/vic.cpp
@@ -0,0 +1,180 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/command_classes/vic.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/texture_cache/surface_params.h"
+
+extern "C" {
+#include <libswscale/swscale.h>
+}
+
+namespace Tegra {
+
+Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
+    : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} // shares the Nvdec instance
+Vic::~Vic() { sws_freeContext(scaler_ctx); } // frees Execute()'s scaler; a null pointer is a no-op
+
+void Vic::VicStateWrite(u32 offset, u32 arguments) {
+    if (offset < sizeof(vic_state) / sizeof(u32)) // skip word indices past the end of vic_state
+        std::memcpy(reinterpret_cast<u32*>(&vic_state) + offset, &arguments, sizeof(u32));
+}
+
+void Vic::ProcessMethod(Vic::Method method, const std::vector<u32>& arguments) {
+    LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
+
+    // Only the first argument word is used. It is mirrored verbatim into the register
+    // state, and the same word shifted left by 8 bits is what the address-carrying
+    // methods below store.
+    const u32 raw_word = arguments[0];
+    VicStateWrite(static_cast<u32>(method), raw_word);
+    const u64 shifted_address = static_cast<u64>(raw_word) << 8;
+
+    if (method == Method::Execute) {
+        Execute();
+    } else if (method == Method::SetConfigStructOffset) {
+        config_struct_address = shifted_address;
+    } else if (method == Method::SetOutputSurfaceLumaOffset) {
+        output_surface_luma_address = shifted_address;
+    } else if (method == Method::SetOutputSurfaceChromaUOffset) {
+        output_surface_chroma_u_address = shifted_address;
+    } else if (method == Method::SetOutputSurfaceChromaVOffset) {
+        output_surface_chroma_v_address = shifted_address;
+    } else {
+        // Every other method (SetControlParams included) only updates the register mirror.
+    }
+}
+
+void Vic::Execute() {
+    // Write the latest decoded NVDEC frame to the output surface in the configured format.
+    if (output_surface_luma_address == 0) {
+        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
+                  vic_state.output_surface.luma_offset);
+        return;
+    }
+    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
+    const VideoPixelFormat pixel_format =
+        static_cast<VideoPixelFormat>(config.pixel_format.Value());
+    switch (pixel_format) {
+    case VideoPixelFormat::BGRA8:
+    case VideoPixelFormat::RGBA8: {
+        LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+        const auto* frame = nvdec_processor->GetFrame();
+        if (!frame || frame->width == 0 || frame->height == 0) {
+            return;
+        }
+        if (scaler_ctx == nullptr || frame->width != scaler_width ||
+            frame->height != scaler_height) {
+            const AVPixelFormat target_format =
+                (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
+
+            sws_freeContext(scaler_ctx);
+            scaler_ctx = nullptr;
+
+            // FFmpeg returns all frames in YUV420, convert it into expected format
+            scaler_ctx =
+                sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
+                               frame->height, target_format, 0, nullptr, nullptr, nullptr);
+
+            scaler_width = frame->width;
+            scaler_height = frame->height;
+        }
+        // Get Converted frame
+        const std::size_t linear_size = frame->width * frame->height * 4;
+
+        using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
+        AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
+
+        const int converted_stride{frame->width * 4};
+        u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
+
+        sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
+                  &converted_frame_buf_addr, &converted_stride);
+
+        const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
+        if (blk_kind != 0) {
+            // swizzle pitch linear to block linear
+            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
+            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
+                                                            block_height, 0);
+            std::vector<u8> swizzled_data(size);
+            Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4,
+                                             swizzled_data.data(), converted_frame_buffer.get(),
+                                             false, block_height, 0, 1);
+
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        } else {
+            // send pitch linear frame
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
+                                           linear_size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        }
+        break;
+    }
+    case VideoPixelFormat::Yuv420: {
+        LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
+        const auto* frame = nvdec_processor->GetFrame();
+        if (!frame || frame->width == 0 || frame->height == 0) {
+            return;
+        }
+
+        const std::size_t surface_width = config.surface_width_minus1 + 1;
+        const std::size_t surface_height = config.surface_height_minus1 + 1;
+        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
+        // Copy no more than the decoded frame holds, so a larger surface cannot overread it.
+        const auto frame_width = static_cast<std::size_t>(frame->width);
+        const auto frame_height = static_cast<std::size_t>(frame->height);
+        const auto copy_width = frame_width < surface_width ? frame_width : surface_width;
+        const auto copy_height = frame_height < surface_height ? frame_height : surface_height;
+        const std::size_t half_width = copy_width / 2;
+        const std::size_t half_height = copy_height / 2;
+
+        const auto* luma_ptr = frame->data[0];
+        const auto* chroma_b_ptr = frame->data[1];
+        const auto* chroma_r_ptr = frame->data[2];
+        const auto stride = frame->linesize[0];
+        const auto half_stride = frame->linesize[1];
+
+        // Rows and columns that are not copied below stay zero-initialised.
+        std::vector<u8> luma_buffer(aligned_width * surface_height);
+        std::vector<u8> chroma_buffer(aligned_width * (surface_height / 2));
+
+        // Populate luma buffer, including the final row.
+        for (std::size_t y = 0; y < copy_height; ++y) {
+            const std::size_t src = y * stride;
+            const std::size_t dst = y * aligned_width;
+            for (std::size_t x = 0; x < copy_width; ++x) {
+                luma_buffer[dst + x] = luma_ptr[src + x];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
+                                       luma_buffer.size());
+
+        // Populate chroma buffer from both channels with interleaving.
+        for (std::size_t y = 0; y < half_height; ++y) {
+            const std::size_t src = y * half_stride;
+            const std::size_t dst = y * aligned_width;
+            for (std::size_t x = 0; x < half_width; ++x) {
+                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
+                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+                                       chroma_buffer.size());
+        gpu.Maxwell3D().OnMemoryWrite();
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+        break;
+    }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
new file mode 100644
index 000000000..dd0a2aed8
--- /dev/null
+++ b/src/video_core/command_classes/vic.h
@@ -0,0 +1,110 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+struct SwsContext;
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+struct PlaneOffsets { // three consecutive u32 words: luma, chroma U, chroma V
+    u32 luma_offset{};
+    u32 chroma_u_offset{};
+    u32 chroma_v_offset{};
+};
+
+struct VicRegisters { // one u32 per register word; total size is asserted below
+    INSERT_PADDING_WORDS(64);
+    u32 nop{};
+    INSERT_PADDING_WORDS(15);
+    u32 pm_trigger{};
+    INSERT_PADDING_WORDS(47);
+    u32 set_application_id{};
+    u32 set_watchdog_timer{};
+    INSERT_PADDING_WORDS(17);
+    u32 context_save_area{};
+    u32 context_switch{};
+    INSERT_PADDING_WORDS(43);
+    u32 execute{}; // word 0xC0
+    INSERT_PADDING_WORDS(63);
+    std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
+    u32 picture_index{}; // word 0x1C0
+    u32 control_params{}; // word 0x1C1
+    u32 config_struct_offset{}; // word 0x1C2
+    u32 filter_struct_offset{};
+    u32 palette_offset{};
+    u32 hist_offset{};
+    u32 context_id{};
+    u32 fce_ucode_size{};
+    PlaneOffsets output_surface{}; // words 0x1C8 (luma), 0x1C9 (chroma U), 0x1CA (chroma V)
+    u32 fce_ucode_offset{};
+    INSERT_PADDING_WORDS(4);
+    std::array<u32, 8> slot_context_id{};
+    INSERT_PADDING_WORDS(16);
+};
+static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
+
+class Vic {
+public:
+    enum class Method : u32 { // values match the word positions of the VicRegisters fields
+        Execute = 0xc0,
+        SetControlParams = 0x1c1,
+        SetConfigStructOffset = 0x1c2,
+        SetOutputSurfaceLumaOffset = 0x1c8,
+        SetOutputSurfaceChromaUOffset = 0x1c9,
+        SetOutputSurfaceChromaVOffset = 0x1ca
+    };
+
+    explicit Vic(GPU& gpu, std::shared_ptr<Tegra::Nvdec> nvdec_processor);
+    ~Vic();
+
+    /// Write to the device state.
+    void ProcessMethod(Vic::Method method, const std::vector<u32>& arguments);
+
+private:
+    void Execute(); // writes the current Nvdec frame to the output surface
+
+    void VicStateWrite(u32 offset, u32 arguments); // offset is a u32 word index into vic_state
+    VicRegisters vic_state{};
+
+    enum class VideoPixelFormat : u64_le { // the formats Execute() handles
+        RGBA8 = 0x1f,
+        BGRA8 = 0x20,
+        Yuv420 = 0x44,
+    };
+
+    union VicConfig { // 64-bit word Execute() reads from config_struct_address + 0x20
+        u64_le raw{};
+        BitField<0, 7, u64_le> pixel_format;
+        BitField<7, 2, u64_le> chroma_loc_horiz;
+        BitField<9, 2, u64_le> chroma_loc_vert;
+        BitField<11, 4, u64_le> block_linear_kind;
+        BitField<15, 4, u64_le> block_linear_height_log2;
+        BitField<19, 3, u64_le> reserved0;
+        BitField<22, 10, u64_le> reserved1;
+        BitField<32, 14, u64_le> surface_width_minus1;
+        BitField<46, 14, u64_le> surface_height_minus1;
+    };
+
+    GPU& gpu;
+    std::shared_ptr<Tegra::Nvdec> nvdec_processor; // source of the frames Execute() converts
+
+    GPUVAddr config_struct_address{}; // each address is the method argument shifted left by 8
+    GPUVAddr output_surface_luma_address{};
+    GPUVAddr output_surface_chroma_u_address{};
+    GPUVAddr output_surface_chroma_v_address{};
+
+    SwsContext* scaler_ctx{}; // raw handle obtained from sws_getContext() in Execute()
+    s32 scaler_width{}; // frame size the current scaler_ctx was created for
+    s32 scaler_height{};
+};
+
+} // namespace Tegra
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 4bb9256e9..171f78183 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -27,9 +27,10 @@ namespace Tegra {
 
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 
-GPU::GPU(Core::System& system_, bool is_async_)
+GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
     : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
       dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
+      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
       maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
       fermi_2d{std::make_unique<Engines::Fermi2D>()},
       kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
@@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() {
     return *dma_pusher;
 }
 
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+    return *cdma_pusher; // created in the GPU constructor's initializer list
+}
+
 const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+    return *cdma_pusher; // const view of the same pusher object
+}
+
 void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     // Synced GPU, is always in sync
     if (!is_async) {
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 2d15d1c6f..b8c613b11 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
+#include "video_core/cdma_pusher.h"
 #include "video_core/dma_pusher.h"
 
 using CacheAddr = std::uintptr_t;
@@ -157,7 +158,7 @@ public:
               method_count(method_count) {}
     };
 
-    explicit GPU(Core::System& system, bool is_async);
+    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
     virtual ~GPU();
 
     /// Binds a renderer to the GPU.
@@ -209,6 +210,15 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();
 
+    /// Returns a const reference to the GPU DMA pusher.
+    const Tegra::DmaPusher& DmaPusher() const;
+
+    /// Returns a reference to the GPU CDMA pusher.
+    Tegra::CDmaPusher& CDmaPusher();
+
+    /// Returns a const reference to the GPU CDMA pusher.
+    const Tegra::CDmaPusher& CDmaPusher() const;
+
     VideoCore::RendererBase& Renderer() {
         return *renderer;
     }
@@ -249,8 +259,9 @@ public:
         return is_async;
     }
 
-    /// Returns a const reference to the GPU DMA pusher.
-    const Tegra::DmaPusher& DmaPusher() const;
+    bool UseNvdec() const {
+        return use_nvdec; // const flag set from the constructor's use_nvdec argument
+    }
 
     struct Regs {
         static constexpr size_t NUM_REGS = 0x40;
@@ -311,6 +322,9 @@ public:
     /// Push GPU command entries to be processed
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
+    /// Push GPU command buffer entries to be processed; `entries` may be moved from
+    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;
+
     /// Swap buffers (render frame)
     virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
 
@@ -349,7 +363,9 @@ protected:
     Core::System& system;
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
     std::unique_ptr<VideoCore::RendererBase> renderer;
+    const bool use_nvdec;
 
 private:
     /// Mapping of command subchannels to their bound engine ids
@@ -372,6 +388,7 @@ private:
     std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
 
     std::mutex sync_mutex;
+    std::mutex device_mutex;
 
     std::condition_variable sync_cv;
 
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 70a3d5738..a9baaf7ef 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -10,12 +10,13 @@
 
 namespace VideoCommon {
 
-GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {}
+GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec)
+    : GPU{system, true, use_nvdec}, gpu_thread{system} {}
 
 GPUAsynch::~GPUAsynch() = default;
 
 void GPUAsynch::Start() {
-    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
+    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
     cpu_context = renderer->GetRenderWindow().CreateSharedContext();
     cpu_context->MakeCurrent();
 }
@@ -32,6 +33,27 @@ void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
 
+void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec || entries.empty()) { // the empty check keeps entries[0] below in bounds
+        return;
+    }
+    // A first entry of 0xDEADB33F signals that a video stream ended: clear all intermediary data
+    if (entries[0].raw == 0xDEADB33F) {
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) { // recreate the pusher, e.g. after the reset above
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+
+    // Entries are pushed and dispatched directly from this call. Routing them through
+    // gpu_thread.SubmitCommandBuffer(std::move(entries)) would make nvdec async, but that is
+    // not currently working. TODO(ameerj): RE proper async nvdec operation
+
+    cdma_pusher->Push(std::move(entries));
+    cdma_pusher->DispatchCalls();
+}
+
 void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
     gpu_thread.SwapBuffers(framebuffer);
 }
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index f89c855a5..0c0872e73 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -20,13 +20,14 @@ namespace VideoCommon {
 /// Implementation of GPU interface that runs the GPU asynchronously
 class GPUAsynch final : public Tegra::GPU {
 public:
-    explicit GPUAsynch(Core::System& system);
+    explicit GPUAsynch(Core::System& system, bool use_nvdec);
     ~GPUAsynch() override;
 
     void Start() override;
     void ObtainContext() override;
     void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 1ca47ddef..ecf7bbdf3 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -7,7 +7,7 @@
 
 namespace VideoCommon {
 
-GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {}
+GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {}
 
 GPUSynch::~GPUSynch() = default;
 
@@ -26,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->DispatchCalls();
 }
 
+void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec || entries.empty()) { // the empty check keeps entries[0] below in bounds
+        return;
+    }
+    // A first entry of 0xDEADB33F signals that a video stream ended: clear all intermediary data
+    if (entries[0].raw == 0xDEADB33F) {
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) { // recreate the pusher, e.g. after the reset above
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+    cdma_pusher->Push(std::move(entries)); // entries are handed over as an rvalue
+    cdma_pusher->DispatchCalls();
+}
+
 void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
     renderer->SwapBuffers(framebuffer);
 }
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 297258cb1..9d778c71a 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -19,13 +19,14 @@ namespace VideoCommon {
 /// Implementation of GPU interface that runs the GPU synchronously
 class GPUSynch final : public Tegra::GPU {
 public:
-    explicit GPUSynch(Core::System& system);
+    explicit GPUSynch(Core::System& system, bool use_nvdec);
     ~GPUSynch() override;
 
     void Start() override;
     void ObtainContext() override;
     void ReleaseContext() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index bf761abf2..4b8f58283 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -18,7 +18,7 @@ namespace VideoCommon::GPUThread {
 /// Runs the GPU thread
 static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                       Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
-                      SynchState& state) {
+                      SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
     std::string name = "yuzu:GPU";
     MicroProfileOnThreadCreate(name.c_str());
     Common::SetCurrentThreadName(name.c_str());
@@ -42,6 +42,10 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
         if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
             dma_pusher.Push(std::move(submit_list->entries));
             dma_pusher.DispatchCalls();
+        } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
+            // NVDEC
+            cdma_pusher.Push(std::move(command_list->entries));
+            cdma_pusher.DispatchCalls();
         } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
             renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
         } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
@@ -75,15 +79,19 @@ ThreadManager::~ThreadManager() {
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                 Core::Frontend::GraphicsContext& context,
-                                Tegra::DmaPusher& dma_pusher) {
-    thread = std::thread{RunThread,         std::ref(system),     std::ref(renderer),
-                         std::ref(context), std::ref(dma_pusher), std::ref(state)};
+                                Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
+    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
+                         std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
 }
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
     PushCommand(SubmitListCommand(std::move(entries)));
 }
 
+void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
+    PushCommand(SubmitChCommandEntries{std::move(entries)}); // wrapped as a CDMA submit command
+}
+
 void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
     PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
 }
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 5a28335d6..32a34e3a7 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -37,6 +37,14 @@ struct SubmitListCommand final {
     Tegra::CommandList entries;
 };
 
+/// Command handing a cdma command header list to the GPU thread for processing
+struct SubmitChCommandEntries final {
+    explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_)
+        : entries{std::move(entries_)} {}
+
+    Tegra::ChCommandHeaderList entries;
+};
+
 /// Command to signal to the GPU thread that a swap buffers is pending
 struct SwapBuffersCommand final {
     explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
@@ -77,9 +85,9 @@ struct OnCommandListEndCommand final {};
 struct GPUTickCommand final {};
 
 using CommandData =
-    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
-                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
-                 GPUTickCommand>;
+    std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
+                 SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
+                 FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
 
 struct CommandDataContainer {
     CommandDataContainer() = default;
@@ -109,11 +117,14 @@ public:
 
     /// Creates and starts the GPU thread.
     void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
-                     Tegra::DmaPusher& dma_pusher);
+                     Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
 
     /// Push GPU command entries to be processed
     void SubmitList(Tegra::CommandList&& entries);
 
+    /// Push GPU CDMA command buffer entries to be processed
+    void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
+
     /// Swap buffers (render frame)
     void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 02cf53d15..6e70bd362 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -11,6 +11,7 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
 
 namespace Tegra {
 
@@ -44,6 +45,12 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_
     return Map(cpu_addr, *FindFreeRange(size, align), size);
 }
 
+GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
+    const auto free_range = FindFreeRange(size, 1, true); // true: search from the low start
+    ASSERT(free_range.has_value()); // NOTE(review): result is not checked to be below 4 GiB
+    return Map(cpu_addr, *free_range, size);
+}
+
 void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
     if (!size) {
         return;
@@ -108,7 +115,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s
     page_table[PageEntryIndex(gpu_addr)] = page_entry;
 }
 
-std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const {
+std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
+                                                     bool start_32bit_address) const {
     if (!align) {
         align = page_size;
     } else {
@@ -116,7 +124,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size
     }
 
     u64 available_size{};
-    GPUVAddr gpu_addr{address_space_start};
+    GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
     while (gpu_addr + available_size < address_space_size) {
         if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
             available_size += page_size;
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 53c8d122a..c078193d9 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -116,6 +116,7 @@ public:
 
     [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
     [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
+    [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
     [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
     [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
     void Unmap(GPUVAddr gpu_addr, std::size_t size);
@@ -124,7 +125,8 @@ private:
     [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
     void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
     GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
-    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const;
+    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
+                                                        bool start_32bit_address = false) const;
 
     void TryLockPage(PageEntry page_entry, std::size_t size);
     void TryUnlockPage(PageEntry page_entry, std::size_t size);
@@ -135,6 +137,7 @@ private:
 
     static constexpr u64 address_space_size = 1ULL << 40;
     static constexpr u64 address_space_start = 1ULL << 32;
+    static constexpr u64 address_space_start_low = 1ULL << 16;
     static constexpr u64 page_bits{16};
     static constexpr u64 page_size{1 << page_bits};
     static constexpr u64 page_mask{page_size - 1};
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index a14df06a3..dd5cee4a1 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -44,10 +44,11 @@ namespace VideoCore {
 
 std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
     std::unique_ptr<Tegra::GPU> gpu;
+    const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
     if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
-        gpu = std::make_unique<VideoCommon::GPUAsynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec);
     } else {
-        gpu = std::make_unique<VideoCommon::GPUSynch>(system);
+        gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec);
     }
 
     auto context = emu_window.CreateSharedContext();
diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt
index cc0291b15..4659e1f89 100644
--- a/src/yuzu/CMakeLists.txt
+++ b/src/yuzu/CMakeLists.txt
@@ -265,9 +265,11 @@ if (MSVC)
     include(CopyYuzuQt5Deps)
     include(CopyYuzuSDLDeps)
     include(CopyYuzuUnicornDeps)
+    include(CopyYuzuFFmpegDeps)
     copy_yuzu_Qt5_deps(yuzu)
     copy_yuzu_SDL_deps(yuzu)
     copy_yuzu_unicorn_deps(yuzu)
+    copy_yuzu_FFmpeg_deps(yuzu)
 endif()
 
 if (NOT APPLE)
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index d2913d613..abbc83929 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -717,6 +717,8 @@ void Config::ReadRendererValues() {
     ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0);
     ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,
                       QStringLiteral("use_asynchronous_gpu_emulation"), false);
+    ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"),
+                      true);
     ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
     ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
                       false);
@@ -1265,6 +1267,8 @@ void Config::SaveRendererValues() {
                        Settings::values.gpu_accuracy.UsingGlobal(), 0);
     WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),
                        Settings::values.use_asynchronous_gpu_emulation, false);
+    WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation,
+                       true);
     WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
     WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
                        Settings::values.use_assembly_shaders, false);
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index 07d818548..4f083ecda 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -70,9 +70,11 @@ void ConfigureGraphics::SetConfiguration() {
     ui->api->setEnabled(runtime_lock);
     ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);
     ui->use_disk_shader_cache->setEnabled(runtime_lock);
+    ui->use_nvdec_emulation->setEnabled(runtime_lock);
     ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());
     ui->use_asynchronous_gpu_emulation->setChecked(
         Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue());
 
     if (Settings::configuring_global) {
         ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue()));
@@ -116,6 +118,9 @@ void ConfigureGraphics::ApplyConfiguration() {
             Settings::values.use_asynchronous_gpu_emulation.SetValue(
                 ui->use_asynchronous_gpu_emulation->isChecked());
         }
+        if (Settings::values.use_nvdec_emulation.UsingGlobal()) {
+            Settings::values.use_nvdec_emulation.SetValue(ui->use_nvdec_emulation->isChecked());
+        }
         if (Settings::values.bg_red.UsingGlobal()) {
             Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));
             Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF()));
@@ -144,6 +149,8 @@ void ConfigureGraphics::ApplyConfiguration() {
         ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,
                                                  ui->use_asynchronous_gpu_emulation,
                                                  use_asynchronous_gpu_emulation);
+        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation,
+                                                 ui->use_nvdec_emulation, use_nvdec_emulation);
 
         if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {
             Settings::values.bg_red.SetGlobal(true);
@@ -240,6 +247,7 @@ void ConfigureGraphics::SetupPerGameUI() {
         ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());
         ui->use_asynchronous_gpu_emulation->setEnabled(
             Settings::values.use_asynchronous_gpu_emulation.UsingGlobal());
+        ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal());
         ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());
         ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal());
 
@@ -253,6 +261,8 @@ void ConfigureGraphics::SetupPerGameUI() {
 
     ConfigurationShared::SetColoredTristate(
         ui->use_disk_shader_cache, Settings::values.use_disk_shader_cache, use_disk_shader_cache);
+    ConfigurationShared::SetColoredTristate(
+        ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation);
     ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation,
                                             Settings::values.use_asynchronous_gpu_emulation,
                                             use_asynchronous_gpu_emulation);
diff --git a/src/yuzu/configuration/configure_graphics.h b/src/yuzu/configuration/configure_graphics.h
index b4961f719..1fefc88eb 100644
--- a/src/yuzu/configuration/configure_graphics.h
+++ b/src/yuzu/configuration/configure_graphics.h
@@ -46,6 +46,7 @@ private:
     std::unique_ptr<Ui::ConfigureGraphics> ui;
     QColor bg_color;
 
+    ConfigurationShared::CheckState use_nvdec_emulation;
     ConfigurationShared::CheckState use_disk_shader_cache;
     ConfigurationShared::CheckState use_asynchronous_gpu_emulation;
 
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index 62aa337e7..58486eb1e 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -97,6 +97,13 @@
           </property>
          </widget>
         </item>
+        <item>
+         <widget class="QCheckBox" name="use_nvdec_emulation">
+          <property name="text">
+           <string>Use NVDEC emulation</string>
+          </property>
+         </widget>
+        </item>
         <item>
          <widget class="QWidget" name="aspect_ratio_layout" native="true">
           <layout class="QHBoxLayout" name="horizontalLayout_6">