ffmpeg: Correctly handle sample rates

Previously, we always used the native sample rate for encoding. However, some encoders, such as libmp3lame, do not support it. Therefore, we now use a sample rate the encoder supports (preferring the native one if possible). FFmpeg requires audio data to be sent as a sequence of frames, each containing the same specific number of samples. Previously, we buffered input samples in FFmpegBackend. However, as the source and destination sample rates can now differ, we should buffer the resampled data instead. swresample has an internal input buffer, so we now just forward all data to it and 'gradually' receive resampled data, at most one frame_size at a time. When there is not enough resampled data to form a frame, we record the current offset and request less data on the next call. Additionally, this commit also fixes a flaw. When an encoder supports variable frame sizes, its frame size is reported as 0, which breaks our buffering system. Now we treat variable-frame-size encoders as having a frame size of 160 (the size of an HLE audio frame).
2020-02-01 12:23:07 +08:00 · 2020-02-01 12:23:07 +08:00 · 4161163d9c
parent 8b9c01ded9
commit 4161163d9c
2 changed files with 84 additions and 64 deletions
--- a/src/core/dumping/ffmpeg_backend.cpp
+++ b/src/core/dumping/ffmpeg_backend.cpp
@ -211,7 +211,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
    if (!FFmpegStream::Init(format_context))
        return false;
-    sample_count = 0;
+    frame_count = 0;
    // Initialize audio codec
    const AVCodec* codec = avcodec_find_encoder_by_name(Settings::values.audio_encoder.c_str());
@ -243,7 +243,20 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
        codec_context->sample_fmt = AV_SAMPLE_FMT_S16P;
    }
    if (codec->supported_samplerates) {
        codec_context->sample_rate = codec->supported_samplerates[0];
        // Prefer native sample rate if supported
        const int* ptr = codec->supported_samplerates;
        while ((*ptr)) {
            if ((*ptr) == AudioCore::native_sample_rate) {
                codec_context->sample_rate = AudioCore::native_sample_rate;
                break;
            }
            ptr++;
        }
    } else {
        codec_context->sample_rate = AudioCore::native_sample_rate;
    }
    codec_context->channel_layout = AV_CH_LAYOUT_STEREO;
    codec_context->channels = 2;
@ -259,6 +272,12 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
        LOG_WARNING(Render, "Audio encoder options not found: {}", buf);
    }
    if (codec_context->frame_size) {
        frame_size = static_cast<u64>(codec_context->frame_size);
    } else { // variable frame size support
        frame_size = std::tuple_size<AudioCore::StereoFrame16>::value;
    }
    // Create audio stream
    stream = avformat_new_stream(format_context, codec);
    if (!stream || avcodec_parameters_from_context(stream->codecpar, codec_context.get()) < 0) {
@ -291,7 +310,7 @@ bool FFmpegAudioStream::Init(AVFormatContext* format_context) {
    // Allocate resampled data
    int error =
        av_samples_alloc_array_and_samples(&resampled_data, nullptr, codec_context->channels,
-                                           codec_context->frame_size, codec_context->sample_fmt, 0);
+                                           frame_size, codec_context->sample_fmt, 0);
    if (error < 0) {
        LOG_ERROR(Render, "Could not allocate samples storage");
        return false;
@ -312,31 +331,62 @@ void FFmpegAudioStream::Free() {
    av_freep(&resampled_data);
 }
-void FFmpegAudioStream::ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) {
+void FFmpegAudioStream::ProcessFrame(const VariableAudioFrame& channel0,
                                     const VariableAudioFrame& channel1) {
    ASSERT_MSG(channel0.size() == channel1.size(),
               "Frames of the two channels must have the same number of samples");
    std::array<const u8*, 2> src_data = {reinterpret_cast<u8*>(channel0.data()),
                                         reinterpret_cast<u8*>(channel1.data())};
    if (swr_convert(swr_context.get(), resampled_data, channel0.size(), src_data.data(),
                    channel0.size()) < 0) {
    const auto sample_size = av_get_bytes_per_sample(codec_context->sample_fmt);
    std::array<const u8*, 2> src_data = {reinterpret_cast<const u8*>(channel0.data()),
                                         reinterpret_cast<const u8*>(channel1.data())};
    std::array<u8*, 2> dst_data = {resampled_data[0] + sample_size * offset,
                                   resampled_data[1] + sample_size * offset};
    auto resampled_count = swr_convert(swr_context.get(), dst_data.data(), frame_size - offset,
                                       src_data.data(), channel0.size());
    if (resampled_count < 0) {
        LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
        return;
    }
-    // Prepare frame
+    offset += resampled_count;
-    audio_frame->nb_samples = channel0.size();
+    if (offset < frame_size) { // Still not enough to form a frame
-    audio_frame->data[0] = resampled_data[0];
+        return;
    audio_frame->data[1] = resampled_data[1];
    audio_frame->pts = sample_count;
    sample_count += channel0.size();
    SendFrame(audio_frame.get());
    }
-std::size_t FFmpegAudioStream::GetAudioFrameSize() const {
+    while (true) {
-    ASSERT_MSG(codec_context, "Codec context is not initialized yet!");
+        // Prepare frame
-    return codec_context->frame_size;
+        audio_frame->nb_samples = frame_size;
        audio_frame->data[0] = resampled_data[0];
        audio_frame->data[1] = resampled_data[1];
        audio_frame->pts = frame_count * frame_size;
        frame_count++;
        SendFrame(audio_frame.get());
        // swr_convert buffers input internally. Try to get more resampled data
        resampled_count = swr_convert(swr_context.get(), resampled_data, frame_size, nullptr, 0);
        if (resampled_count < 0) {
            LOG_ERROR(Render, "Audio frame dropped: Could not resample data");
            return;
        }
        if (static_cast<u64>(resampled_count) < frame_size) {
            offset = resampled_count;
            break;
        }
    }
 }
 void FFmpegAudioStream::Flush() {
    // Send the last samples
    audio_frame->nb_samples = offset;
    audio_frame->data[0] = resampled_data[0];
    audio_frame->data[1] = resampled_data[1];
    audio_frame->pts = frame_count * frame_size;
    SendFrame(audio_frame.get());
    FFmpegStream::Flush();
 }
 FFmpegMuxer::~FFmpegMuxer() {
@ -402,7 +452,8 @@ void FFmpegMuxer::ProcessVideoFrame(VideoFrame& frame) {
    video_stream.ProcessFrame(frame);
 }
-void FFmpegMuxer::ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1) {
+void FFmpegMuxer::ProcessAudioFrame(const VariableAudioFrame& channel0,
                                    const VariableAudioFrame& channel1) {
    audio_stream.ProcessFrame(channel0, channel1);
 }
@ -414,10 +465,6 @@ void FFmpegMuxer::FlushAudio() {
    audio_stream.Flush();
 }
 std::size_t FFmpegMuxer::GetAudioFrameSize() const {
    return audio_stream.GetAudioFrameSize();
 }
 void FFmpegMuxer::WriteTrailer() {
    av_write_trailer(format_context.get());
 }
@ -498,24 +545,20 @@ void FFmpegBackend::AddVideoFrame(VideoFrame frame) {
 }
 void FFmpegBackend::AddAudioFrame(AudioCore::StereoFrame16 frame) {
-    std::array<std::array<s16, 160>, 2> refactored_frame;
+    std::array<VariableAudioFrame, 2> refactored_frame;
    for (auto& channel : refactored_frame) {
        channel.resize(frame.size());
    }
    for (std::size_t i = 0; i < frame.size(); i++) {
        refactored_frame[0][i] = frame[i][0];
        refactored_frame[1][i] = frame[i][1];
    }
-    for (auto i : {0, 1}) {
+    ffmpeg.ProcessAudioFrame(refactored_frame[0], refactored_frame[1]);
        audio_buffers[i].insert(audio_buffers[i].end(), refactored_frame[i].begin(),
                                refactored_frame[i].end());
    }
    CheckAudioBuffer();
 }
 void FFmpegBackend::AddAudioSample(const std::array<s16, 2>& sample) {
-    for (auto i : {0, 1}) {
+    ffmpeg.ProcessAudioFrame({sample[0]}, {sample[1]});
        audio_buffers[i].push_back(sample[i]);
    }
    CheckAudioBuffer();
 }
 void FFmpegBackend::StopDumping() {
@ -525,12 +568,6 @@ void FFmpegBackend::StopDumping() {
    // Flush the video processing queue
    AddVideoFrame(VideoFrame());
    for (auto i : {0, 1}) {
        // Add remaining data to audio queue
        if (audio_buffers[i].size() >= 0) {
            VariableAudioFrame buffer(audio_buffers[i].begin(), audio_buffers[i].end());
            audio_frame_queues[i].Push(std::move(buffer));
            audio_buffers[i].clear();
        }
        // Flush the audio processing queue
        audio_frame_queues[i].Push(VariableAudioFrame());
    }
@ -554,18 +591,4 @@ void FFmpegBackend::EndDumping() {
    processing_ended.Set();
 }
 void FFmpegBackend::CheckAudioBuffer() {
    for (auto i : {0, 1}) {
        const std::size_t frame_size = ffmpeg.GetAudioFrameSize();
        // Add audio data to the queue when there is enough to form a frame
        while (audio_buffers[i].size() >= frame_size) {
            VariableAudioFrame buffer(audio_buffers[i].begin(),
                                      audio_buffers[i].begin() + frame_size);
            audio_frame_queues[i].Push(std::move(buffer));
            audio_buffers[i].erase(audio_buffers[i].begin(), audio_buffers[i].begin() + frame_size);
        }
    }
 }
 } // namespace VideoDumper
--- a/src/core/dumping/ffmpeg_backend.h
+++ b/src/core/dumping/ffmpeg_backend.h
@ -96,6 +96,7 @@ private:
 /**
 * A FFmpegStream used for audio data.
 * Resamples (converts), encodes and writes a frame.
 * This also temporarily stores resampled audio data until there is enough to form a frame.
 */
 class FFmpegAudioStream : public FFmpegStream {
 public:
@ -103,8 +104,8 @@ public:
    bool Init(AVFormatContext* format_context);
    void Free();
-    void ProcessFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1);
+    void ProcessFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
-    std::size_t GetAudioFrameSize() const;
+    void Flush();
 private:
    struct SwrContextDeleter {
@ -113,12 +114,14 @@ private:
        }
    };
-    u64 sample_count{};
+    u64 frame_size{};
    u64 frame_count{};
    std::unique_ptr<AVFrame, AVFrameDeleter> audio_frame{};
    std::unique_ptr<SwrContext, SwrContextDeleter> swr_context{};
    u8** resampled_data{};
    u64 offset{}; // Number of output samples that are currently in resampled_data.
 };
 /**
@ -132,10 +135,9 @@ public:
    bool Init(const std::string& path, const Layout::FramebufferLayout& layout);
    void Free();
    void ProcessVideoFrame(VideoFrame& frame);
-    void ProcessAudioFrame(VariableAudioFrame& channel0, VariableAudioFrame& channel1);
+    void ProcessAudioFrame(const VariableAudioFrame& channel0, const VariableAudioFrame& channel1);
    void FlushVideo();
    void FlushAudio();
    std::size_t GetAudioFrameSize() const;
    void WriteTrailer();
 private:
@ -153,8 +155,7 @@ private:
 /**
 * FFmpeg video dumping backend.
- * This class implements a double buffer, and an audio queue to keep audio data
+ * This class implements a double buffer.
 * before enough data is received to form a frame.
 */
 class FFmpegBackend : public Backend {
 public:
@ -169,7 +170,6 @@ public:
    Layout::FramebufferLayout GetLayout() const override;
 private:
    void CheckAudioBuffer();
    void EndDumping();
    std::atomic_bool is_dumping = false; ///< Whether the backend is currently dumping
@ -182,9 +182,6 @@ private:
    Common::Event event1, event2;
    std::thread video_processing_thread;
    /// An audio buffer used to temporarily hold audio data, before the size is big enough
    /// to be sent to the encoder as a frame
    std::array<VariableAudioFrame, 2> audio_buffers;
    std::array<Common::SPSCQueue<VariableAudioFrame>, 2> audio_frame_queues;
    std::thread audio_processing_thread;