time_stretch: Simplify audio stretcher
This commit is contained in:
parent
f34711219a
commit
eed55a813e
|
@ -15,6 +15,7 @@ DspInterface::DspInterface() = default;
|
||||||
DspInterface::~DspInterface() = default;
|
DspInterface::~DspInterface() = default;
|
||||||
|
|
||||||
void DspInterface::SetSink(const std::string& sink_id, const std::string& audio_device) {
|
void DspInterface::SetSink(const std::string& sink_id, const std::string& audio_device) {
|
||||||
|
sink.reset();
|
||||||
const SinkDetails& sink_details = GetSinkDetails(sink_id);
|
const SinkDetails& sink_details = GetSinkDetails(sink_id);
|
||||||
sink = sink_details.factory(audio_device);
|
sink = sink_details.factory(audio_device);
|
||||||
sink->SetCallback(
|
sink->SetCallback(
|
||||||
|
@ -32,7 +33,7 @@ void DspInterface::EnableStretching(bool enable) {
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!enable) {
|
if (!enable) {
|
||||||
FlushResidualStretcherAudio();
|
flushing_time_stretcher = true;
|
||||||
}
|
}
|
||||||
perform_time_stretching = enable;
|
perform_time_stretching = enable;
|
||||||
}
|
}
|
||||||
|
@ -51,17 +52,27 @@ void DspInterface::OutputFrame(StereoFrame16& frame) {
|
||||||
fifo.Push(frame.data(), frame.size());
|
fifo.Push(frame.data(), frame.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void DspInterface::FlushResidualStretcherAudio() {}
|
void DspInterface::OutputCallback(s16* buffer, std::size_t num_frames) {
|
||||||
|
std::size_t frames_written;
|
||||||
void DspInterface::OutputCallback(s16* buffer, size_t num_frames) {
|
if (perform_time_stretching) {
|
||||||
const size_t frames_written = fifo.Pop(buffer, num_frames);
|
const std::vector<s16> in{fifo.Pop()};
|
||||||
|
const std::size_t num_in{in.size() / 2};
|
||||||
|
frames_written = time_stretcher.Process(in.data(), num_in, buffer, num_frames);
|
||||||
|
} else if (flushing_time_stretcher) {
|
||||||
|
time_stretcher.Flush();
|
||||||
|
frames_written = time_stretcher.Process(nullptr, 0, buffer, num_frames);
|
||||||
|
// Append FIFO audio after the frames the stretcher just wrote (2 s16 samples per stereo frame) rather than overwriting them at the start of the buffer.
frames_written += fifo.Pop(buffer + 2 * frames_written, num_frames - frames_written);
|
||||||
|
flushing_time_stretcher = false;
|
||||||
|
} else {
|
||||||
|
frames_written = fifo.Pop(buffer, num_frames);
|
||||||
|
}
|
||||||
|
|
||||||
if (frames_written > 0) {
|
if (frames_written > 0) {
|
||||||
std::memcpy(&last_frame[0], buffer + 2 * (frames_written - 1), 2 * sizeof(s16));
|
std::memcpy(&last_frame[0], buffer + 2 * (frames_written - 1), 2 * sizeof(s16));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hold last emitted frame; this prevents popping.
|
// Hold last emitted frame; this prevents popping.
|
||||||
for (size_t i = frames_written; i < num_frames; i++) {
|
for (std::size_t i = frames_written; i < num_frames; i++) {
|
||||||
std::memcpy(buffer + 2 * i, &last_frame[0], 2 * sizeof(s16));
|
std::memcpy(buffer + 2 * i, &last_frame[0], 2 * sizeof(s16));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -85,7 +85,8 @@ private:
|
||||||
void OutputCallback(s16* buffer, std::size_t num_frames);
|
void OutputCallback(s16* buffer, std::size_t num_frames);
|
||||||
|
|
||||||
std::unique_ptr<Sink> sink;
|
std::unique_ptr<Sink> sink;
|
||||||
bool perform_time_stretching = false;
|
std::atomic<bool> perform_time_stretching = false;
|
||||||
|
std::atomic<bool> flushing_time_stretcher = false;
|
||||||
Common::RingBuffer<s16, 0x2000, 2> fifo;
|
Common::RingBuffer<s16, 0x2000, 2> fifo;
|
||||||
std::array<s16, 2> last_frame{};
|
std::array<s16, 2> last_frame{};
|
||||||
TimeStretcher time_stretcher;
|
TimeStretcher time_stretcher;
|
||||||
|
|
|
@ -3,143 +3,75 @@
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <chrono>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <vector>
|
#include <cstddef>
|
||||||
|
#include <memory>
|
||||||
#include <SoundTouch.h>
|
#include <SoundTouch.h>
|
||||||
#include "audio_core/audio_types.h"
|
#include "audio_core/audio_types.h"
|
||||||
#include "audio_core/time_stretch.h"
|
#include "audio_core/time_stretch.h"
|
||||||
#include "common/common_types.h"
|
|
||||||
#include "common/logging/log.h"
|
#include "common/logging/log.h"
|
||||||
|
|
||||||
using steady_clock = std::chrono::steady_clock;
|
|
||||||
|
|
||||||
namespace AudioCore {
|
namespace AudioCore {
|
||||||
|
|
||||||
constexpr double MIN_RATIO = 0.1;
|
TimeStretcher::TimeStretcher()
|
||||||
constexpr double MAX_RATIO = 100.0;
|
: sample_rate(native_sample_rate), sound_touch(std::make_unique<soundtouch::SoundTouch>()) {
|
||||||
|
sound_touch->setChannels(2);
|
||||||
static double ClampRatio(double ratio) {
|
sound_touch->setSampleRate(native_sample_rate);
|
||||||
return std::clamp(ratio, MIN_RATIO, MAX_RATIO);
|
sound_touch->setPitch(1.0);
|
||||||
|
sound_touch->setTempo(1.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr double MIN_DELAY_TIME = 0.05; // Units: seconds
|
TimeStretcher::~TimeStretcher() = default;
|
||||||
constexpr double MAX_DELAY_TIME = 0.25; // Units: seconds
|
|
||||||
constexpr std::size_t DROP_FRAMES_SAMPLE_DELAY = 16000; // Units: samples
|
|
||||||
|
|
||||||
constexpr double SMOOTHING_FACTOR = 0.007;
|
|
||||||
|
|
||||||
struct TimeStretcher::Impl {
|
|
||||||
soundtouch::SoundTouch soundtouch;
|
|
||||||
|
|
||||||
steady_clock::time_point frame_timer = steady_clock::now();
|
|
||||||
std::size_t samples_queued = 0;
|
|
||||||
|
|
||||||
double smoothed_ratio = 1.0;
|
|
||||||
|
|
||||||
double sample_rate = static_cast<double>(native_sample_rate);
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<s16> TimeStretcher::Process(std::size_t samples_in_queue) {
|
|
||||||
// This is a very simple algorithm without any fancy control theory. It works and is stable.
|
|
||||||
|
|
||||||
double ratio = CalculateCurrentRatio();
|
|
||||||
ratio = CorrectForUnderAndOverflow(ratio, samples_in_queue);
|
|
||||||
impl->smoothed_ratio =
|
|
||||||
(1.0 - SMOOTHING_FACTOR) * impl->smoothed_ratio + SMOOTHING_FACTOR * ratio;
|
|
||||||
impl->smoothed_ratio = ClampRatio(impl->smoothed_ratio);
|
|
||||||
|
|
||||||
// SoundTouch's tempo definition is the inverse of our ratio definition.
|
|
||||||
impl->soundtouch.setTempo(1.0 / impl->smoothed_ratio);
|
|
||||||
|
|
||||||
std::vector<s16> samples = GetSamples();
|
|
||||||
if (samples_in_queue >= DROP_FRAMES_SAMPLE_DELAY) {
|
|
||||||
samples.clear();
|
|
||||||
LOG_DEBUG(Audio, "Dropping frames!");
|
|
||||||
}
|
|
||||||
return samples;
|
|
||||||
}
|
|
||||||
|
|
||||||
TimeStretcher::TimeStretcher() : impl(std::make_unique<Impl>()) {
|
|
||||||
impl->soundtouch.setPitch(1.0);
|
|
||||||
impl->soundtouch.setChannels(2);
|
|
||||||
impl->soundtouch.setSampleRate(native_sample_rate);
|
|
||||||
Reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
TimeStretcher::~TimeStretcher() {
|
|
||||||
impl->soundtouch.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) {
|
void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) {
|
||||||
impl->sample_rate = static_cast<double>(sample_rate);
|
sound_touch->setSampleRate(sample_rate);
|
||||||
impl->soundtouch.setRate(static_cast<double>(native_sample_rate) / impl->sample_rate);
|
sample_rate = native_sample_rate;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TimeStretcher::AddSamples(const s16* buffer, std::size_t num_samples) {
|
std::size_t TimeStretcher::Process(const s16* in, std::size_t num_in, s16* out,
|
||||||
impl->soundtouch.putSamples(buffer, static_cast<uint>(num_samples));
|
std::size_t num_out) {
|
||||||
impl->samples_queued += num_samples;
|
const double time_delta = static_cast<double>(num_out) / sample_rate; // seconds
|
||||||
|
double current_ratio = static_cast<double>(num_in) / static_cast<double>(num_out);
|
||||||
|
|
||||||
|
const double max_latency = 0.25; // seconds
|
||||||
|
const double max_backlog = sample_rate * max_latency;
|
||||||
|
const double backlog_fullness = sound_touch->numSamples() / max_backlog;
|
||||||
|
if (backlog_fullness > 4.0) {
|
||||||
|
// Too many samples in backlog: Don't push any more on
|
||||||
|
num_in = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We ideally want the backlog to be about 50% full.
|
||||||
|
// This gives some headroom both ways to prevent underflow and overflow.
|
||||||
|
// We tweak current_ratio to encourage this.
|
||||||
|
constexpr double tweak_time_scale = 0.050; // seconds
|
||||||
|
const double tweak_correction = (backlog_fullness - 0.5) * (time_delta / tweak_time_scale);
|
||||||
|
current_ratio *= std::pow(1.0 + 2.0 * tweak_correction, tweak_correction < 0 ? 3.0 : 1.0);
|
||||||
|
|
||||||
|
// This low-pass filter smooths out variance in the calculated stretch ratio.
|
||||||
|
// The time-scale determines how responsive this filter is.
|
||||||
|
constexpr double lpf_time_scale = 0.712; // seconds
|
||||||
|
const double lpf_gain = 1.0 - std::exp(-time_delta / lpf_time_scale);
|
||||||
|
stretch_ratio += lpf_gain * (current_ratio - stretch_ratio);
|
||||||
|
|
||||||
|
// Place a lower limit of 5% speed. When a game boots up, there will be
|
||||||
|
// many silence samples. These do not need to be timestretched.
|
||||||
|
stretch_ratio = std::max(stretch_ratio, 0.05);
|
||||||
|
sound_touch->setTempo(stretch_ratio);
|
||||||
|
|
||||||
|
LOG_DEBUG(Audio, "{:5}/{:5} ratio:{:0.6f} backlog:{:0.6f}", num_in, num_out, stretch_ratio,
|
||||||
|
backlog_fullness);
|
||||||
|
|
||||||
|
sound_touch->putSamples(in, num_in);
|
||||||
|
return sound_touch->receiveSamples(out, num_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
void TimeStretcher::Clear() {
|
||||||
|
sound_touch->clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void TimeStretcher::Flush() {
|
void TimeStretcher::Flush() {
|
||||||
impl->soundtouch.flush();
|
sound_touch->flush();
|
||||||
}
|
|
||||||
|
|
||||||
void TimeStretcher::Reset() {
|
|
||||||
impl->soundtouch.setTempo(1.0);
|
|
||||||
impl->soundtouch.clear();
|
|
||||||
impl->smoothed_ratio = 1.0;
|
|
||||||
impl->frame_timer = steady_clock::now();
|
|
||||||
impl->samples_queued = 0;
|
|
||||||
SetOutputSampleRate(native_sample_rate);
|
|
||||||
}
|
|
||||||
|
|
||||||
double TimeStretcher::CalculateCurrentRatio() {
|
|
||||||
const steady_clock::time_point now = steady_clock::now();
|
|
||||||
const std::chrono::duration<double> duration = now - impl->frame_timer;
|
|
||||||
|
|
||||||
const double expected_time =
|
|
||||||
static_cast<double>(impl->samples_queued) / static_cast<double>(native_sample_rate);
|
|
||||||
const double actual_time = duration.count();
|
|
||||||
|
|
||||||
double ratio;
|
|
||||||
if (expected_time != 0) {
|
|
||||||
ratio = ClampRatio(actual_time / expected_time);
|
|
||||||
} else {
|
|
||||||
ratio = impl->smoothed_ratio;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl->frame_timer = now;
|
|
||||||
impl->samples_queued = 0;
|
|
||||||
|
|
||||||
return ratio;
|
|
||||||
}
|
|
||||||
|
|
||||||
double TimeStretcher::CorrectForUnderAndOverflow(double ratio, std::size_t sample_delay) const {
|
|
||||||
const std::size_t min_sample_delay =
|
|
||||||
static_cast<std::size_t>(MIN_DELAY_TIME * impl->sample_rate);
|
|
||||||
const std::size_t max_sample_delay =
|
|
||||||
static_cast<std::size_t>(MAX_DELAY_TIME * impl->sample_rate);
|
|
||||||
|
|
||||||
if (sample_delay < min_sample_delay) {
|
|
||||||
// Make the ratio bigger.
|
|
||||||
ratio = ratio > 1.0 ? ratio * ratio : sqrt(ratio);
|
|
||||||
} else if (sample_delay > max_sample_delay) {
|
|
||||||
// Make the ratio smaller.
|
|
||||||
ratio = ratio > 1.0 ? sqrt(ratio) : ratio * ratio;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ClampRatio(ratio);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<s16> TimeStretcher::GetSamples() {
|
|
||||||
uint available = impl->soundtouch.numSamples();
|
|
||||||
|
|
||||||
std::vector<s16> output(static_cast<std::size_t>(available) * 2);
|
|
||||||
|
|
||||||
impl->soundtouch.receiveSamples(output.data(), available);
|
|
||||||
|
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace AudioCore
|
} // namespace AudioCore
|
||||||
|
|
|
@ -4,57 +4,39 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <array>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <vector>
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
|
||||||
|
namespace soundtouch {
|
||||||
|
class SoundTouch;
|
||||||
|
}
|
||||||
|
|
||||||
namespace AudioCore {
|
namespace AudioCore {
|
||||||
|
|
||||||
class TimeStretcher final {
|
class TimeStretcher {
|
||||||
public:
|
public:
|
||||||
TimeStretcher();
|
TimeStretcher();
|
||||||
~TimeStretcher();
|
~TimeStretcher();
|
||||||
|
|
||||||
/**
|
|
||||||
* Set sample rate for the samples that Process returns.
|
|
||||||
* @param sample_rate The sample rate.
|
|
||||||
*/
|
|
||||||
void SetOutputSampleRate(unsigned int sample_rate);
|
void SetOutputSampleRate(unsigned int sample_rate);
|
||||||
|
|
||||||
/**
|
/// @param in Input sample buffer
|
||||||
* Add samples to be processed.
|
/// @param num_in Number of input frames in `in`
|
||||||
* @param sample_buffer Buffer of samples in interleaved stereo PCM16 format.
|
/// @param out Output sample buffer
|
||||||
* @param num_samples Number of samples.
|
/// @param num_out Desired number of output frames in `out`
|
||||||
*/
|
/// @returns Actual number of frames written to `out`
|
||||||
void AddSamples(const s16* sample_buffer, std::size_t num_samples);
|
std::size_t Process(const s16* in, std::size_t num_in, s16* out, std::size_t num_out);
|
||||||
|
|
||||||
|
void Clear();
|
||||||
|
|
||||||
/// Flush audio remaining in internal buffers.
|
|
||||||
void Flush();
|
void Flush();
|
||||||
|
|
||||||
/// Resets internal state and clears buffers.
|
|
||||||
void Reset();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Does audio stretching and produces the time-stretched samples.
|
|
||||||
* Timer calculations use sample_delay to determine how much of a margin we have.
|
|
||||||
* @param sample_delay How many samples are buffered downstream of this module and haven't been
|
|
||||||
* played yet.
|
|
||||||
* @return Samples to play in interleaved stereo PCM16 format.
|
|
||||||
*/
|
|
||||||
std::vector<s16> Process(std::size_t sample_delay);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct Impl;
|
unsigned int sample_rate;
|
||||||
std::unique_ptr<Impl> impl;
|
std::unique_ptr<soundtouch::SoundTouch> sound_touch;
|
||||||
|
double stretch_ratio = 1.0;
|
||||||
/// INTERNAL: ratio = wallclock time / emulated time
|
|
||||||
double CalculateCurrentRatio();
|
|
||||||
/// INTERNAL: If we have too many or too few samples downstream, nudge ratio in the appropriate
|
|
||||||
/// direction.
|
|
||||||
double CorrectForUnderAndOverflow(double ratio, std::size_t sample_delay) const;
|
|
||||||
/// INTERNAL: Gets the time-stretched samples from SoundTouch.
|
|
||||||
std::vector<s16> GetSamples();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace AudioCore
|
} // namespace AudioCore
|
||||||
|
|
Reference in New Issue