Implement async downloads in Normal GPU accuracy and fix a few issues.
This commit is contained in:
parent
f2d3212de9
commit
ed4553806a
|
@ -22,6 +22,8 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
|
||||||
void(slot_buffers.insert(runtime, NullBufferParams{}));
|
void(slot_buffers.insert(runtime, NullBufferParams{}));
|
||||||
common_ranges.clear();
|
common_ranges.clear();
|
||||||
|
|
||||||
|
active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh();
|
||||||
|
|
||||||
if (!runtime.CanReportMemoryUsage()) {
|
if (!runtime.CanReportMemoryUsage()) {
|
||||||
minimum_memory = DEFAULT_EXPECTED_MEMORY;
|
minimum_memory = DEFAULT_EXPECTED_MEMORY;
|
||||||
critical_memory = DEFAULT_CRITICAL_MEMORY;
|
critical_memory = DEFAULT_CRITICAL_MEMORY;
|
||||||
|
@ -72,6 +74,8 @@ void BufferCache<P>::TickFrame() {
|
||||||
uniform_cache_hits[0] = 0;
|
uniform_cache_hits[0] = 0;
|
||||||
uniform_cache_shots[0] = 0;
|
uniform_cache_shots[0] = 0;
|
||||||
|
|
||||||
|
active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh();
|
||||||
|
|
||||||
const bool skip_preferred = hits * 256 < shots * 251;
|
const bool skip_preferred = hits * 256 < shots * 251;
|
||||||
uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
|
uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
|
||||||
|
|
||||||
|
@ -130,7 +134,7 @@ void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) {
|
||||||
|
|
||||||
template <class P>
|
template <class P>
|
||||||
void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
|
void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
|
||||||
async_downloads -= std::make_pair(subtract_interval, std::numeric_limits<int>::max());
|
RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024);
|
||||||
uncommitted_ranges.subtract(subtract_interval);
|
uncommitted_ranges.subtract(subtract_interval);
|
||||||
pending_ranges.subtract(subtract_interval);
|
pending_ranges.subtract(subtract_interval);
|
||||||
for (auto& interval_set : committed_ranges) {
|
for (auto& interval_set : committed_ranges) {
|
||||||
|
@ -173,18 +177,14 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
|
||||||
}};
|
}};
|
||||||
|
|
||||||
boost::container::small_vector<IntervalType, 4> tmp_intervals;
|
boost::container::small_vector<IntervalType, 4> tmp_intervals;
|
||||||
const bool is_high_accuracy =
|
|
||||||
Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
|
|
||||||
auto mirror = [&](VAddr base_address, VAddr base_address_end) {
|
auto mirror = [&](VAddr base_address, VAddr base_address_end) {
|
||||||
const u64 size = base_address_end - base_address;
|
const u64 size = base_address_end - base_address;
|
||||||
const VAddr diff = base_address - *cpu_src_address;
|
const VAddr diff = base_address - *cpu_src_address;
|
||||||
const VAddr new_base_address = *cpu_dest_address + diff;
|
const VAddr new_base_address = *cpu_dest_address + diff;
|
||||||
const IntervalType add_interval{new_base_address, new_base_address + size};
|
const IntervalType add_interval{new_base_address, new_base_address + size};
|
||||||
tmp_intervals.push_back(add_interval);
|
tmp_intervals.push_back(add_interval);
|
||||||
if (is_high_accuracy) {
|
uncommitted_ranges.add(add_interval);
|
||||||
uncommitted_ranges.add(add_interval);
|
pending_ranges.add(add_interval);
|
||||||
pending_ranges.add(add_interval);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
|
ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
|
||||||
// This subtraction in this order is important for overlapping copies.
|
// This subtraction in this order is important for overlapping copies.
|
||||||
|
@ -468,7 +468,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
|
||||||
AccumulateFlushes();
|
AccumulateFlushes();
|
||||||
|
|
||||||
if (committed_ranges.empty()) {
|
if (committed_ranges.empty()) {
|
||||||
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
|
if (active_async_buffers) {
|
||||||
async_buffers.emplace_back(std::optional<Async_Buffer>{});
|
async_buffers.emplace_back(std::optional<Async_Buffer>{});
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
|
@ -529,31 +529,33 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
|
||||||
}
|
}
|
||||||
committed_ranges.clear();
|
committed_ranges.clear();
|
||||||
if (downloads.empty()) {
|
if (downloads.empty()) {
|
||||||
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
|
if (active_async_buffers) {
|
||||||
async_buffers.emplace_back(std::optional<Async_Buffer>{});
|
async_buffers.emplace_back(std::optional<Async_Buffer>{});
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
|
if (active_async_buffers) {
|
||||||
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
|
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
|
||||||
boost::container::small_vector<BufferCopy, 4> normalized_copies;
|
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
|
||||||
IntervalSet new_async_range{};
|
boost::container::small_vector<BufferCopy, 4> normalized_copies;
|
||||||
runtime.PreCopyBarrier();
|
IntervalSet new_async_range{};
|
||||||
for (auto& [copy, buffer_id] : downloads) {
|
runtime.PreCopyBarrier();
|
||||||
copy.dst_offset += download_staging.offset;
|
for (auto& [copy, buffer_id] : downloads) {
|
||||||
const std::array copies{copy};
|
copy.dst_offset += download_staging.offset;
|
||||||
BufferCopy second_copy{copy};
|
const std::array copies{copy};
|
||||||
Buffer& buffer = slot_buffers[buffer_id];
|
BufferCopy second_copy{copy};
|
||||||
second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
|
Buffer& buffer = slot_buffers[buffer_id];
|
||||||
VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
|
second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
|
||||||
const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
|
VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
|
||||||
async_downloads += std::make_pair(base_interval, 1);
|
const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
|
||||||
runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
|
async_downloads += std::make_pair(base_interval, 1);
|
||||||
normalized_copies.push_back(second_copy);
|
runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
|
||||||
|
normalized_copies.push_back(second_copy);
|
||||||
|
}
|
||||||
|
runtime.PostCopyBarrier();
|
||||||
|
pending_downloads.emplace_back(std::move(normalized_copies));
|
||||||
|
async_buffers.emplace_back(download_staging);
|
||||||
}
|
}
|
||||||
runtime.PostCopyBarrier();
|
|
||||||
pending_downloads.emplace_back(std::move(normalized_copies));
|
|
||||||
async_buffers.emplace_back(download_staging);
|
|
||||||
} else {
|
} else {
|
||||||
if constexpr (USE_MEMORY_MAPS) {
|
if constexpr (USE_MEMORY_MAPS) {
|
||||||
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
|
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
|
||||||
|
@ -624,7 +626,8 @@ void BufferCache<P>::PopAsyncBuffers() {
|
||||||
common_ranges.subtract(base_interval);
|
common_ranges.subtract(base_interval);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1);
|
const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size};
|
||||||
|
RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1);
|
||||||
}
|
}
|
||||||
runtime.FreeDeferredStagingBuffer(*async_buffer);
|
runtime.FreeDeferredStagingBuffer(*async_buffer);
|
||||||
async_buffers.pop_front();
|
async_buffers.pop_front();
|
||||||
|
@ -1198,10 +1201,8 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
|
||||||
|
|
||||||
const IntervalType base_interval{cpu_addr, cpu_addr + size};
|
const IntervalType base_interval{cpu_addr, cpu_addr + size};
|
||||||
common_ranges.add(base_interval);
|
common_ranges.add(base_interval);
|
||||||
if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
|
uncommitted_ranges.add(base_interval);
|
||||||
uncommitted_ranges.add(base_interval);
|
pending_ranges.add(base_interval);
|
||||||
pending_ranges.add(base_interval);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class P>
|
template <class P>
|
||||||
|
@ -1542,7 +1543,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
|
||||||
.size = new_size,
|
.size = new_size,
|
||||||
});
|
});
|
||||||
// Align up to avoid cache conflicts
|
// Align up to avoid cache conflicts
|
||||||
constexpr u64 align = 8ULL;
|
constexpr u64 align = 64ULL;
|
||||||
constexpr u64 mask = ~(align - 1ULL);
|
constexpr u64 mask = ~(align - 1ULL);
|
||||||
total_size_bytes += (new_size + align - 1) & mask;
|
total_size_bytes += (new_size + align - 1) & mask;
|
||||||
largest_copy = std::max(largest_copy, new_size);
|
largest_copy = std::max(largest_copy, new_size);
|
||||||
|
|
|
@ -345,13 +345,30 @@ private:
|
||||||
if (inter_addr < start_address) {
|
if (inter_addr < start_address) {
|
||||||
inter_addr = start_address;
|
inter_addr = start_address;
|
||||||
}
|
}
|
||||||
if (it->second <= 0) {
|
|
||||||
__debugbreak();
|
|
||||||
}
|
|
||||||
func(inter_addr, inter_addr_end, it->second);
|
func(inter_addr, inter_addr_end, it->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Adds `subtract_value` to `current_range` over `search_interval`, then erases
// every entry in the lower_bound/upper_bound window of `search_interval` whose
// stored count (`it->second`) is zero or negative.
// The call sites visible in this diff pass negative values (-1 and -1024), so
// the add acts as a decrement of the overlap count.
// NOTE(review): OverlapCounter is declared outside this hunk; presumably a
// boost::icl interval map keyed by IntervalType -- confirm.
void RemoveEachInOverlapCounter(OverlapCounter& current_range, const IntervalType search_interval, int subtract_value) {
|
||||||
|
bool any_removals = false;
|
||||||
|
current_range.add(std::make_pair(search_interval, subtract_value)); // apply the adjustment before pruning
|
||||||
|
do { // each pass erases at most one entry, then rescans
|
||||||
|
any_removals = false; // reset for this pass
|
||||||
|
auto it = current_range.lower_bound(search_interval);
|
||||||
|
if (it == current_range.end()) { // no candidate entries left to inspect
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto end_it = current_range.upper_bound(search_interval);
|
||||||
|
for (; it != end_it; it++) {
|
||||||
|
if (it->second <= 0) { // count is exhausted for this sub-interval
|
||||||
|
any_removals = true;
|
||||||
|
current_range.erase(it);
|
||||||
|
break; // NOTE(review): scan restarts after each erase, presumably because erase invalidates `it`/`end_it` -- confirm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (any_removals); // stop once a full pass finds nothing to erase
|
||||||
|
}
|
||||||
|
|
||||||
static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
|
static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
|
||||||
return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
|
return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
|
||||||
((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
|
((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
|
||||||
|
@ -554,6 +571,8 @@ private:
|
||||||
u64 minimum_memory = 0;
|
u64 minimum_memory = 0;
|
||||||
u64 critical_memory = 0;
|
u64 critical_memory = 0;
|
||||||
|
|
||||||
|
bool active_async_buffers = false;
|
||||||
|
|
||||||
std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
|
std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -273,7 +273,7 @@ public:
|
||||||
untracked_words[word_index] &= ~bits;
|
untracked_words[word_index] &= ~bits;
|
||||||
NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
|
NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
|
||||||
}
|
}
|
||||||
const u64 word = current_word;
|
const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
|
||||||
u64 page = page_begin;
|
u64 page = page_begin;
|
||||||
page_begin = 0;
|
page_begin = 0;
|
||||||
|
|
||||||
|
@ -321,6 +321,7 @@ public:
|
||||||
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
|
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
|
||||||
static_assert(type != Type::Untracked);
|
static_assert(type != Type::Untracked);
|
||||||
|
|
||||||
|
const u64* const untracked_words = Array<Type::Untracked>();
|
||||||
const u64* const state_words = Array<type>();
|
const u64* const state_words = Array<type>();
|
||||||
const u64 num_query_words = size / BYTES_PER_WORD + 1;
|
const u64 num_query_words = size / BYTES_PER_WORD + 1;
|
||||||
const u64 word_begin = offset / BYTES_PER_WORD;
|
const u64 word_begin = offset / BYTES_PER_WORD;
|
||||||
|
@ -328,7 +329,8 @@ public:
|
||||||
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
|
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
|
||||||
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
|
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
|
||||||
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
|
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
|
||||||
const u64 word = state_words[word_index];
|
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
|
||||||
|
const u64 word = state_words[word_index] & ~off_word;
|
||||||
if (word == 0) {
|
if (word == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
Reference in New Issue