From aa2aa1c0157ef84ba359a5c55f3c227465768e0d Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Tue, 24 Jun 2025 23:46:44 -0700 Subject: [PATCH 01/16] encode/store audio as part of video file --- system/loggerd/loggerd.cc | 26 +++++++- system/loggerd/loggerd.h | 2 + system/loggerd/video_writer.cc | 107 ++++++++++++++++++++++++++++++++- system/loggerd/video_writer.h | 15 ++++- 4 files changed, 146 insertions(+), 4 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 898216e5b6f6e0..348e1ec78bf99b 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -83,7 +83,8 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc assert(encoder_info.filename != NULL); re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(), encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C, - edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType())); + edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType(), + encoder_info.include_audio)); // write the header auto header = edata.getHeader(); re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false); @@ -214,7 +215,7 @@ void loggerd_thread() { typedef struct ServiceState { std::string name; int counter, freq; - bool encoder, user_flag; + bool encoder, user_flag, record_audio; } ServiceState; std::unordered_map service_state; std::unordered_map remote_encoders; @@ -239,6 +240,7 @@ void loggerd_thread() { .freq = it.decimation, .encoder = encoder, .user_flag = it.name == "userFlag", + .record_audio = record_audio, }; } } @@ -249,10 +251,20 @@ void loggerd_thread() { Params().put("CurrentRoute", s.logger.routeName()); std::map encoder_infos_dict; + std::vector encoders_with_audio; for (const auto &cam : cameras_logged) { for (const auto &encoder_info : cam.encoder_infos) { encoder_infos_dict[encoder_info.publish_name] = encoder_info; s.max_waiting++; + + if (encoder_info.include_audio) { + for (auto& [sock, service] : service_state) { + if (service.name == encoder_info.publish_name) { + encoders_with_audio.push_back(&remote_encoders[sock]); + break; + } + } + } } } @@ -276,6 +288,16 @@ void loggerd_thread() { if (service.encoder) { s.last_camera_seen_tms = millis_since_boot(); bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]); + } else if (service.record_audio) { + capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word))); + auto event = cmsg.getRoot(); + auto audio_data = event.getAudioData(); + for (auto* encoder : encoders_with_audio) { + if (encoder && encoder->writer) { + encoder->writer->write_audio(audio_data, event.getLogMonoTime()); + } + } + delete msg; } else { s.logger.write((uint8_t *)msg->getData(), msg->getSize(), in_qlog); bytes_count += msg->getSize(); diff --git a/system/loggerd/loggerd.h b/system/loggerd/loggerd.h index 27d2d37fc42ad5..5dfb178fd5fefa 100644 --- a/system/loggerd/loggerd.h +++ b/system/loggerd/loggerd.h @@ -35,6 +35,7 @@ class EncoderInfo { const char *thumbnail_name = NULL; const char *filename = NULL; bool record = true; + bool include_audio = false; int frame_width = -1; int frame_height = -1; int fps = MAIN_FPS; @@ -106,6 +107,7 @@ const EncoderInfo qcam_encoder_info = { .encode_type = cereal::EncodeIndex::Type::QCAMERA_H264, .frame_width = 526, .frame_height = 330, + .include_audio = Params().getBool("RecordAudio"), INIT_ENCODE_FUNCTIONS(QRoadEncode), }; diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 90b5f1af3d1b94..6a35ea3a47797b 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -5,7 +5,7 @@ #include "common/swaglog.h" #include "common/util.h" -VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec) +VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio) : remuxing(remuxing) { vid_path = util::string_format("%s/%s", path, filename); lock_path = util::string_format("%s/%s.lock", path, filename); @@ -41,6 +41,44 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL); assert(this->out_stream); + if (has_audio) { + if (this->ofmt_ctx->oformat->audio_codec == AV_CODEC_ID_NONE) { + LOGE("Output format '%s' does not support audio streams, continuing without audio. Please change the output format or the set include_audio to false.", this->ofmt_ctx->oformat->name); + } else { + const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); + assert(audio_avcodec); + this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec); + assert(this->audio_codec_ctx); + this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; + this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py + this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; + this->audio_codec_ctx->bit_rate = 32000; + this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + + int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); + assert(err >= 0); + + this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); + assert(this->audio_stream); + err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); + assert(err >= 0); + this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate}; + + this->audio_frame = av_frame_alloc(); + assert(this->audio_frame); + this->audio_frame->format = this->audio_codec_ctx->sample_fmt; + this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; + this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; + this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; + int ret = av_frame_get_buffer(this->audio_frame, 0); + if (ret < 0) { + LOGE("AUDIO: Failed to allocate frame buffer: %d", ret); + av_frame_free(&this->audio_frame); + this->audio_frame = nullptr; + } + } + } + int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE); assert(err >= 0); @@ -77,6 +115,7 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc av_init_packet(&pkt); pkt.data = data; pkt.size = len; + pkt.stream_index = this->out_stream->index; enum AVRounding rnd = static_cast(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX); pkt.pts = pkt.dts = av_rescale_q_rnd(timestamp, in_timebase, ofmt_ctx->streams[0]->time_base, rnd); @@ -95,11 +134,77 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } } +void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime) { + if (!this->remuxing || !this->audio_codec_ctx) return; + + // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet + if (this->first_audio_logMonoTime == 0) { + this->first_audio_logMonoTime = logMonoTime; + } + + // convert s16le samples to fltp and add to buffer + auto data = audio_data.getData(); + const int16_t *raw_samples = reinterpret_cast(data.begin()); + for (int i = 0; i < audio_data.getLength(); i++) { + this->audio_buffer.push_back(raw_samples[i] / 32768.0f); + } + this->buffered_samples += audio_data.getLength(); + + // only encode/write when we have enough samples for the encoder + while (this->buffered_samples >= this->audio_codec_ctx->frame_size) { + this->audio_frame->pts = this->next_audio_pts; + + float *f_samples = reinterpret_cast(this->audio_frame->data[0]); + for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) { + f_samples[i] = this->audio_buffer[i]; + } + + // remove used samples from buffer + for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) { + this->audio_buffer.pop_front(); + } + this->buffered_samples -= this->audio_codec_ctx->frame_size; + + // encode frames + int send_result = avcodec_send_frame(this->audio_codec_ctx, this->audio_frame); + if (send_result >= 0) { + AVPacket *pkt = av_packet_alloc(); + while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) { + // calculate and rescale timestamp based on the current frame's PTS + uint64_t total_samples = this->audio_frame->pts; + uint64_t time_diff_ns = (total_samples * 1000000000ULL) / this->audio_codec_ctx->sample_rate; + uint64_t synchronized_mono_time = this->first_audio_logMonoTime + time_diff_ns; + uint64_t timestamp_us = synchronized_mono_time / 1000; + AVRational in_timebase = {1, 1000000}; + int64_t pts = av_rescale_q(timestamp_us, in_timebase, this->audio_stream->time_base); + + this->last_audio_pts = std::max(pts, this->last_audio_pts + 1); // Ensure PTS is monotonically increasing to prevent TS discontinuities + + pkt->pts = pkt->dts = this->last_audio_pts; + pkt->stream_index = this->audio_stream->index; + + // write encoded frames + int err = av_interleaved_write_frame(this->ofmt_ctx, pkt); + if (err < 0) { + LOGW("AUDIO: Write frame failed - error: %d", err); + } + av_packet_unref(pkt); + } + av_packet_free(&pkt); + } else { + LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result); + } + this->next_audio_pts += this->audio_codec_ctx->frame_size; + } +} + VideoWriter::~VideoWriter() { if (this->remuxing) { int err = av_write_trailer(this->ofmt_ctx); if (err != 0) LOGE("av_write_trailer failed %d", err); avcodec_free_context(&this->codec_ctx); + if (this->audio_codec_ctx) avcodec_free_context(&this->audio_codec_ctx); + if (this->audio_frame) av_frame_free(&this->audio_frame); err = avio_closep(&this->ofmt_ctx->pb); if (err != 0) LOGE("avio_closep failed %d", err); avformat_free_context(this->ofmt_ctx); diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 1aa758b42b78b0..b4a7d5ee9461ad 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -1,6 +1,7 @@ #pragma once #include +#include extern "C" { #include @@ -11,9 +12,11 @@ extern "C" { class VideoWriter { public: - VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec); + VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio); void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe); + void write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime); ~VideoWriter(); + private: std::string vid_path, lock_path; FILE *of = nullptr; @@ -21,5 +24,15 @@ class VideoWriter { AVCodecContext *codec_ctx; AVFormatContext *ofmt_ctx; AVStream *out_stream; + + AVStream *audio_stream = nullptr; + AVCodecContext *audio_codec_ctx = nullptr; + AVFrame *audio_frame = nullptr; + uint64_t next_audio_pts = 0; + int64_t last_audio_pts = 0; + uint64_t first_audio_logMonoTime = 0; + std::deque audio_buffer; + uint64_t buffered_samples = 0; + bool remuxing; }; From 1409c53041bed7a42c2a54832974675de9ead4fe Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:46:22 -0700 Subject: [PATCH 02/16] better match write_audio() with write() --- system/loggerd/loggerd.cc | 4 +-- system/loggerd/video_writer.cc | 61 ++++++++++++++-------------------- system/loggerd/video_writer.h | 5 ++- 3 files changed, 29 insertions(+), 41 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 348e1ec78bf99b..4b230e3953843d 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -291,10 +291,10 @@ void loggerd_thread() { } else if (service.record_audio) { capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word))); auto event = cmsg.getRoot(); - auto audio_data = event.getAudioData(); + auto audio_data = event.getAudioData().getData(); for (auto* encoder : encoders_with_audio) { if (encoder && encoder->writer) { - encoder->writer->write_audio(audio_data, event.getLogMonoTime()); + encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000); } } delete msg; diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 6a35ea3a47797b..fe813278e2ef55 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -134,57 +134,46 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } } -void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime) { - if (!this->remuxing || !this->audio_codec_ctx) return; +void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { + if (!remuxing || !audio_codec_ctx) return; // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet - if (this->first_audio_logMonoTime == 0) { - this->first_audio_logMonoTime = logMonoTime; + if (first_audio_timestamp == 0) { + first_audio_timestamp = timestamp; // microseconds } // convert s16le samples to fltp and add to buffer - auto data = audio_data.getData(); - const int16_t *raw_samples = reinterpret_cast(data.begin()); - for (int i = 0; i < audio_data.getLength(); i++) { - this->audio_buffer.push_back(raw_samples[i] / 32768.0f); + const int16_t *raw_samples = reinterpret_cast(data); + int sample_count = len / sizeof(int16_t); + for (int i = 0; i < sample_count; i++) { + audio_buffer.push_back(raw_samples[i] / 32768.0f); } - this->buffered_samples += audio_data.getLength(); + buffered_samples += sample_count; - // only encode/write when we have enough samples for the encoder - while (this->buffered_samples >= this->audio_codec_ctx->frame_size) { - this->audio_frame->pts = this->next_audio_pts; + while (buffered_samples >= audio_codec_ctx->frame_size) { + audio_frame->pts = next_audio_pts; - float *f_samples = reinterpret_cast(this->audio_frame->data[0]); - for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) { - f_samples[i] = this->audio_buffer[i]; + float *f_samples = reinterpret_cast(audio_frame->data[0]); + for (int i = 0; i < audio_codec_ctx->frame_size; i++) { + f_samples[i] = audio_buffer[i]; } - // remove used samples from buffer - for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) { - this->audio_buffer.pop_front(); + for (int i = 0; i < audio_codec_ctx->frame_size; i++) { + audio_buffer.pop_front(); } - this->buffered_samples -= this->audio_codec_ctx->frame_size; + buffered_samples -= audio_codec_ctx->frame_size; - // encode frames - int send_result = avcodec_send_frame(this->audio_codec_ctx, this->audio_frame); + int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames if (send_result >= 0) { AVPacket *pkt = av_packet_alloc(); - while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) { - // calculate and rescale timestamp based on the current frame's PTS - uint64_t total_samples = this->audio_frame->pts; - uint64_t time_diff_ns = (total_samples * 1000000000ULL) / this->audio_codec_ctx->sample_rate; - uint64_t synchronized_mono_time = this->first_audio_logMonoTime + time_diff_ns; - uint64_t timestamp_us = synchronized_mono_time / 1000; + while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) { + uint64_t time_diff_us = (audio_frame->pts * 1000000ULL) / audio_codec_ctx->sample_rate; + uint64_t synchronized_time = first_audio_timestamp + time_diff_us; AVRational in_timebase = {1, 1000000}; - int64_t pts = av_rescale_q(timestamp_us, in_timebase, this->audio_stream->time_base); + pkt->pts = pkt->dts = av_rescale_q(synchronized_time, in_timebase, audio_stream->time_base); + pkt->stream_index = audio_stream->index; - this->last_audio_pts = std::max(pts, this->last_audio_pts + 1); // Ensure PTS is monotonically increasing to prevent TS discontinuities - - pkt->pts = pkt->dts = this->last_audio_pts; - pkt->stream_index = this->audio_stream->index; - - // write encoded frames - int err = av_interleaved_write_frame(this->ofmt_ctx, pkt); + int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frames if (err < 0) { LOGW("AUDIO: Write frame failed - error: %d", err); } @@ -194,7 +183,7 @@ void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint6 } else { LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result); } - this->next_audio_pts += this->audio_codec_ctx->frame_size; + next_audio_pts += audio_codec_ctx->frame_size; } } diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index b4a7d5ee9461ad..ef3fb56213ae3e 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -14,7 +14,7 @@ class VideoWriter { public: VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio); void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe); - void write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime); + void write_audio(uint8_t *data, int len, long long timestamp); ~VideoWriter(); private: @@ -29,8 +29,7 @@ class VideoWriter { AVCodecContext *audio_codec_ctx = nullptr; AVFrame *audio_frame = nullptr; uint64_t next_audio_pts = 0; - int64_t last_audio_pts = 0; - uint64_t first_audio_logMonoTime = 0; + uint64_t first_audio_timestamp = 0; std::deque audio_buffer; uint64_t buffered_samples = 0; From d0e9a1e0683146f54b872331153c9b0a3f7e2bc0 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:47:51 -0700 Subject: [PATCH 03/16] handle different FFmpeg versions, flush audio encoder, suppress encoder QAvg/info messages --- system/loggerd/video_writer.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index fe813278e2ef55..6b89f4a7e83d76 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -51,12 +51,17 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, assert(this->audio_codec_ctx); this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1); + #else this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; + #endif this->audio_codec_ctx->bit_rate = 32000; this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); assert(err >= 0); + av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); assert(this->audio_stream); @@ -67,7 +72,11 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, this->audio_frame = av_frame_alloc(); assert(this->audio_frame); this->audio_frame->format = this->audio_codec_ctx->sample_fmt; + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout); + #else this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; + #endif this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; int ret = av_frame_get_buffer(this->audio_frame, 0); @@ -189,10 +198,16 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { VideoWriter::~VideoWriter() { if (this->remuxing) { + if (this->audio_codec_ctx) { // flush audio encoder + avcodec_send_frame(this->audio_codec_ctx, NULL); + AVPacket *pkt = av_packet_alloc(); + while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) av_packet_unref(pkt); + av_packet_free(&pkt); + avcodec_free_context(&this->audio_codec_ctx); + } int err = av_write_trailer(this->ofmt_ctx); if (err != 0) LOGE("av_write_trailer failed %d", err); avcodec_free_context(&this->codec_ctx); - if (this->audio_codec_ctx) avcodec_free_context(&this->audio_codec_ctx); if (this->audio_frame) av_frame_free(&this->audio_frame); err = avio_closep(&this->ofmt_ctx->pb); if (err != 0) LOGE("avio_closep failed %d", err); From 70d3dc24e3e5e27b2720baac5586d4415ee5664f Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Wed, 25 Jun 2025 16:46:51 -0700 Subject: [PATCH 04/16] use audio_buffer.size() instead of keeping track of size separately --- system/loggerd/video_writer.cc | 4 +--- system/loggerd/video_writer.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 6b89f4a7e83d76..241cf99c61b1a9 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -157,9 +157,8 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { for (int i = 0; i < sample_count; i++) { audio_buffer.push_back(raw_samples[i] / 32768.0f); } - buffered_samples += sample_count; - while (buffered_samples >= audio_codec_ctx->frame_size) { + while (audio_buffer.size() >= audio_codec_ctx->frame_size) { audio_frame->pts = next_audio_pts; float *f_samples = reinterpret_cast(audio_frame->data[0]); @@ -170,7 +169,6 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { for (int i = 0; i < audio_codec_ctx->frame_size; i++) { audio_buffer.pop_front(); } - buffered_samples -= audio_codec_ctx->frame_size; int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames if (send_result >= 0) { diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index ef3fb56213ae3e..986de9780b1808 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -31,7 +31,6 @@ class VideoWriter { uint64_t next_audio_pts = 0; uint64_t first_audio_timestamp = 0; std::deque audio_buffer; - uint64_t buffered_samples = 0; bool remuxing; }; From 1b994cab96d96f6671933a25d628a639d4847f37 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:40:23 -0700 Subject: [PATCH 05/16] no more for loops --- system/loggerd/video_writer.cc | 18 ++++++++---------- system/loggerd/video_writer.h | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 241cf99c61b1a9..0f50f1f5a9b7d0 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -154,21 +154,19 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { // convert s16le samples to fltp and add to buffer const int16_t *raw_samples = reinterpret_cast(data); int sample_count = len / sizeof(int16_t); - for (int i = 0; i < sample_count; i++) { - audio_buffer.push_back(raw_samples[i] / 32768.0f); - } + audio_buffer.reserve(audio_buffer.size() + sample_count); + constexpr float normalizer = 1.0f / 32768.0f; + std::transform(raw_samples, raw_samples + sample_count, std::back_inserter(audio_buffer), + [](int16_t sample) { + return sample * normalizer; + }); while (audio_buffer.size() >= audio_codec_ctx->frame_size) { audio_frame->pts = next_audio_pts; float *f_samples = reinterpret_cast(audio_frame->data[0]); - for (int i = 0; i < audio_codec_ctx->frame_size; i++) { - f_samples[i] = audio_buffer[i]; - } - - for (int i = 0; i < audio_codec_ctx->frame_size; i++) { - audio_buffer.pop_front(); - } + std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples); + audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size); int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames if (send_result >= 0) { diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 986de9780b1808..110fba9b97b75f 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include extern "C" { #include @@ -30,7 +30,7 @@ class VideoWriter { AVFrame *audio_frame = nullptr; uint64_t next_audio_pts = 0; uint64_t first_audio_timestamp = 0; - std::deque audio_buffer; + std::vector audio_buffer; bool remuxing; }; From 9d01fb9832f019d0e8f7ca136a0df130eb0707a7 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 26 Jun 2025 13:55:21 -0700 Subject: [PATCH 06/16] save to qcam and rlog --- system/loggerd/loggerd.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 4b230e3953843d..090831a6d8b7e4 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -285,10 +285,8 @@ void loggerd_thread() { Message *msg = nullptr; while (!do_exit && (msg = sock->receive(true))) { const bool in_qlog = service.freq != -1 && (service.counter++ % service.freq == 0); - if (service.encoder) { - s.last_camera_seen_tms = millis_since_boot(); - bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]); - } else if (service.record_audio) { + + if (service.record_audio) { capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word))); auto event = cmsg.getRoot(); auto audio_data = event.getAudioData().getData(); @@ -297,7 +295,11 @@ void loggerd_thread() { encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000); } } - delete msg; + } + + if (service.encoder) { + s.last_camera_seen_tms = millis_since_boot(); + bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]); } else { s.logger.write((uint8_t *)msg->getData(), msg->getSize(), in_qlog); bytes_count += msg->getSize(); From 488844955f75053931a3f20829eef7cd4c511dcd Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 26 Jun 2025 16:53:25 -0700 Subject: [PATCH 07/16] assert audio support check --- system/loggerd/video_writer.cc | 83 ++++++++++++++++------------------ 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 0f50f1f5a9b7d0..d246a00d4e4288 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -42,49 +42,46 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, assert(this->out_stream); if (has_audio) { - if (this->ofmt_ctx->oformat->audio_codec == AV_CODEC_ID_NONE) { - LOGE("Output format '%s' does not support audio streams, continuing without audio. Please change the output format or the set include_audio to false.", this->ofmt_ctx->oformat->name); - } else { - const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); - assert(audio_avcodec); - this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec); - assert(this->audio_codec_ctx); - this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; - this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py - #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ - av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1); - #else - this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; - #endif - this->audio_codec_ctx->bit_rate = 32000; - this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - - int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); - assert(err >= 0); - av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment - - this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); - assert(this->audio_stream); - err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); - assert(err >= 0); - this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate}; - - this->audio_frame = av_frame_alloc(); - assert(this->audio_frame); - this->audio_frame->format = this->audio_codec_ctx->sample_fmt; - #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ - av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout); - #else - this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; - #endif - this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; - this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; - int ret = av_frame_get_buffer(this->audio_frame, 0); - if (ret < 0) { - LOGE("AUDIO: Failed to allocate frame buffer: %d", ret); - av_frame_free(&this->audio_frame); - this->audio_frame = nullptr; - } + assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams + const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); + assert(audio_avcodec); + this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec); + assert(this->audio_codec_ctx); + this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; + this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1); + #else + this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; + #endif + this->audio_codec_ctx->bit_rate = 32000; + this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + + int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); + assert(err >= 0); + av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment + + this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); + assert(this->audio_stream); + err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); + assert(err >= 0); + this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate}; + + this->audio_frame = av_frame_alloc(); + assert(this->audio_frame); + this->audio_frame->format = this->audio_codec_ctx->sample_fmt; + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout); + #else + this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; + #endif + this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; + this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; + int ret = av_frame_get_buffer(this->audio_frame, 0); + if (ret < 0) { + LOGE("AUDIO: Failed to allocate frame buffer: %d", ret); + av_frame_free(&this->audio_frame); + this->audio_frame = nullptr; } } From 78f22320ad11903c015165313688b7b650aeb9d2 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 26 Jun 2025 19:17:58 -0700 Subject: [PATCH 08/16] microphone --> soundPressure, audioData --> rawAudioData --- system/loggerd/loggerd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 090831a6d8b7e4..4cb31a6be19de1 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -289,7 +289,7 @@ void loggerd_thread() { if (service.record_audio) { capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word))); auto event = cmsg.getRoot(); - auto audio_data = event.getAudioData().getData(); + auto audio_data = event.getRawAudioData().getData(); for (auto* encoder : encoders_with_audio) { if (encoder && encoder->writer) { encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000); From 4a157c3e82758a0f23db6200bf677c76b62783cc Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 26 Jun 2025 20:01:26 -0700 Subject: [PATCH 09/16] deque much more efficient if buffer ever >> frame_size, ~ same performance for defaults --- system/loggerd/video_writer.cc | 9 ++++----- system/loggerd/video_writer.h | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index d246a00d4e4288..c6c52ba4a90821 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -151,12 +151,11 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { // convert s16le samples to fltp and add to buffer const int16_t *raw_samples = reinterpret_cast(data); int sample_count = len / sizeof(int16_t); - audio_buffer.reserve(audio_buffer.size() + sample_count); constexpr float normalizer = 1.0f / 32768.0f; - std::transform(raw_samples, raw_samples + sample_count, std::back_inserter(audio_buffer), - [](int16_t sample) { - return sample * normalizer; - }); + const size_t original_size = audio_buffer.size(); + audio_buffer.resize(original_size + sample_count); + std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size, + [](int16_t sample) { return sample * normalizer; }); while (audio_buffer.size() >= audio_codec_ctx->frame_size) { audio_frame->pts = next_audio_pts; diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 110fba9b97b75f..986de9780b1808 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include extern "C" { #include @@ -30,7 +30,7 @@ class VideoWriter { AVFrame *audio_frame = nullptr; uint64_t next_audio_pts = 0; uint64_t first_audio_timestamp = 0; - std::vector audio_buffer; + std::deque audio_buffer; bool remuxing; }; From 6b951cf2a80628ff06bb671b8fbd96b1b7bd25cd Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 14:54:50 -0700 Subject: [PATCH 10/16] cleanup and fix time scaling --- system/loggerd/loggerd.cc | 16 ++++---- system/loggerd/video_writer.cc | 70 +++++++++++++++------------------- system/loggerd/video_writer.h | 8 ++-- 3 files changed, 43 insertions(+), 51 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 4cb31a6be19de1..3826a829ad176b 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -256,15 +256,13 @@ void loggerd_thread() { for (const auto &encoder_info : cam.encoder_infos) { encoder_infos_dict[encoder_info.publish_name] = encoder_info; s.max_waiting++; + } + } - if (encoder_info.include_audio) { - for (auto& [sock, service] : service_state) { - if (service.name == encoder_info.publish_name) { - encoders_with_audio.push_back(&remote_encoders[sock]); - break; - } - } - } + for (auto &[sock, service] : service_state) { + auto it = encoder_infos_dict.find(service.name); + if (it != encoder_infos_dict.end() && it->second.include_audio) { + encoders_with_audio.push_back(&remote_encoders[sock]); } } @@ -292,7 +290,7 @@ void loggerd_thread() { auto audio_data = event.getRawAudioData().getData(); for (auto* encoder : encoders_with_audio) { if (encoder && encoder->writer) { - encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000); + encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000); } } } diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index c6c52ba4a90821..f954ccca886036 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -5,7 +5,7 @@ #include "common/swaglog.h" #include "common/util.h" -VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio) +VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio) : remuxing(remuxing) { vid_path = util::string_format("%s/%s", path, filename); lock_path = util::string_format("%s/%s.lock", path, filename); @@ -41,7 +41,7 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL); assert(this->out_stream); - if (has_audio) { + if (include_audio) { assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); assert(audio_avcodec); @@ -56,7 +56,7 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, #endif this->audio_codec_ctx->bit_rate = 32000; this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - + this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate}; int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); assert(err >= 0); av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment @@ -65,7 +65,6 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, assert(this->audio_stream); err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); assert(err >= 0); - this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate}; this->audio_frame = av_frame_alloc(); assert(this->audio_frame); @@ -77,12 +76,8 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, #endif this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; - int ret = av_frame_get_buffer(this->audio_frame, 0); - if (ret < 0) { - LOGE("AUDIO: Failed to allocate frame buffer: %d", ret); - av_frame_free(&this->audio_frame); - this->audio_frame = nullptr; - } + err = av_frame_get_buffer(this->audio_frame, 0); + assert(err >= 0); } int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE); @@ -143,9 +138,9 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { if (!remuxing || !audio_codec_ctx) return; - // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet - if (first_audio_timestamp == 0) { - first_audio_timestamp = timestamp; // microseconds + // sync logMonoTime of first audio packet with the timestampEof of first video packet + if (audio_pts == 0) { + audio_pts = (timestamp * audio_codec_ctx->sample_rate) / 1000000ULL; } // convert s16le samples to fltp and add to buffer @@ -158,43 +153,40 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { [](int16_t sample) { return sample * normalizer; }); while (audio_buffer.size() >= audio_codec_ctx->frame_size) { - audio_frame->pts = next_audio_pts; - + audio_frame->pts = audio_pts; float *f_samples = reinterpret_cast(audio_frame->data[0]); std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples); audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size); + encode_and_write_audio_frame(audio_frame); + } +} - int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames - if (send_result >= 0) { - AVPacket *pkt = av_packet_alloc(); - while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) { - uint64_t time_diff_us = (audio_frame->pts * 1000000ULL) / audio_codec_ctx->sample_rate; - uint64_t synchronized_time = first_audio_timestamp + time_diff_us; - AVRational in_timebase = {1, 1000000}; - pkt->pts = pkt->dts = av_rescale_q(synchronized_time, in_timebase, audio_stream->time_base); - pkt->stream_index = audio_stream->index; - - int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frames - if (err < 0) { - LOGW("AUDIO: Write frame failed - error: %d", err); - } - av_packet_unref(pkt); +void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) { + if (!remuxing || !audio_codec_ctx) return; + int send_result = avcodec_send_frame(audio_codec_ctx, frame); // encode frame + if (send_result >= 0) { + AVPacket *pkt = av_packet_alloc(); + while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) { + av_packet_rescale_ts(pkt, audio_codec_ctx->time_base, audio_stream->time_base); + pkt->stream_index = audio_stream->index; + + int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frame + if (err < 0) { + LOGW("AUDIO: Write frame failed - error: %d", err); } - av_packet_free(&pkt); - } else { - LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result); } - next_audio_pts += audio_codec_ctx->frame_size; + av_packet_free(&pkt); + } else { + LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result); } + audio_pts += audio_codec_ctx->frame_size; } + VideoWriter::~VideoWriter() { if (this->remuxing) { - if (this->audio_codec_ctx) { // flush audio encoder - avcodec_send_frame(this->audio_codec_ctx, NULL); - AVPacket *pkt = av_packet_alloc(); - while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) av_packet_unref(pkt); - av_packet_free(&pkt); + if (this->audio_codec_ctx) { + encode_and_write_audio_frame(NULL); // flush encoder avcodec_free_context(&this->audio_codec_ctx); } int err = av_write_trailer(this->ofmt_ctx); diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 986de9780b1808..724acf0a3877c2 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -12,12 +12,15 @@ extern "C" { class VideoWriter { public: - VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio); + VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio); void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe); void write_audio(uint8_t *data, int len, long long timestamp); + ~VideoWriter(); private: + void encode_and_write_audio_frame(AVFrame* frame); + std::string vid_path, lock_path; FILE *of = nullptr; @@ -28,8 +31,7 @@ class VideoWriter { AVStream *audio_stream = nullptr; AVCodecContext *audio_codec_ctx = nullptr; AVFrame *audio_frame = nullptr; - uint64_t next_audio_pts = 0; - uint64_t first_audio_timestamp = 0; + uint64_t audio_pts = 0; std::deque audio_buffer; bool remuxing; From 15694516940a83edcfbff01ddd717c41b1f2b18d Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 20:57:30 -0700 Subject: [PATCH 11/16] initialize audio separately and pass sample_rate in --- system/loggerd/loggerd.cc | 29 ++++++---- system/loggerd/video_writer.cc | 96 ++++++++++++++++++---------------- system/loggerd/video_writer.h | 6 ++- 3 files changed, 74 insertions(+), 57 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index 3826a829ad176b..de8e186e8942bc 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -62,6 +62,7 @@ struct RemoteEncoder { bool recording = false; bool marked_ready_to_rotate = false; bool seen_first_packet = false; + bool audio_initialized = false; }; size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEncoder &re, const EncoderInfo &encoder_info) { @@ -80,11 +81,6 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc } // if we aren't actually recording, don't create the writer if (encoder_info.record) { - assert(encoder_info.filename != NULL); - re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(), - encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C, - edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType(), - encoder_info.include_audio)); // write the header auto header = edata.getHeader(); re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false); @@ -139,13 +135,19 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct // if this is a new segment, we close any possible old segments, move to the new, and process any queued packets if (re.current_segment != s->logger.segment()) { - if (re.recording) { - re.writer.reset(); + if (encoder_info.record) { + assert(encoder_info.filename != NULL); + re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(), + encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C, + edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType())); re.recording = false; + re.audio_initialized = false; } re.current_segment = s->logger.segment(); re.marked_ready_to_rotate = false; // we are in this segment now, process any queued messages before this one + } + if (re.audio_initialized || !encoder_info.include_audio) { if (!re.q.empty()) { for (auto qmsg : re.q) { capnp::FlatArrayMessageReader reader({(capnp::word *)qmsg->getData(), qmsg->getSize() / sizeof(capnp::word)}); @@ -154,9 +156,14 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct } re.q.clear(); } + bytes_count += write_encode_data(s, event, re, encoder_info); + delete msg; + } else if (re.q.size() > MAIN_FPS*10) { + LOGE_100("%s: dropping frame waiting for audio initialization, queue is too large", name.c_str()); + delete msg; + } else { + re.q.push_back(msg); // queue up all the new segment messages, they go in after audio is initialized } - bytes_count += write_encode_data(s, event, re, encoder_info); - delete msg; } else if (offset_segment_num > s->logger.segment()) { // encoderd packet has a newer segment, this means encoderd has rolled over if (!re.marked_ready_to_rotate) { @@ -288,9 +295,11 @@ void loggerd_thread() { capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word))); auto event = cmsg.getRoot(); auto audio_data = event.getRawAudioData().getData(); + auto sample_rate = event.getRawAudioData().getSampleRate(); for (auto* encoder : encoders_with_audio) { if (encoder && encoder->writer) { - encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000); + encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000, sample_rate); + encoder->audio_initialized = true; } } } diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index f954ccca886036..960b5a14c1f503 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -5,7 +5,7 @@ #include "common/swaglog.h" #include "common/util.h" -VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio) +VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec) : remuxing(remuxing) { vid_path = util::string_format("%s/%s", path, filename); lock_path = util::string_format("%s/%s.lock", path, filename); @@ -41,45 +41,6 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL); assert(this->out_stream); - if (include_audio) { - assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams - const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); - assert(audio_avcodec); - this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec); - assert(this->audio_codec_ctx); - this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; - this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py - #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ - av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1); - #else - this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; - #endif - this->audio_codec_ctx->bit_rate = 32000; - this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate}; - int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); - assert(err >= 0); - av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment - - this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); - assert(this->audio_stream); - err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); - assert(err >= 0); - - this->audio_frame = av_frame_alloc(); - assert(this->audio_frame); - this->audio_frame->format = this->audio_codec_ctx->sample_fmt; - #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ - av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout); - #else - this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; - #endif - this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; - this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; - err = av_frame_get_buffer(this->audio_frame, 0); - assert(err >= 0); - } - int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE); assert(err >= 0); @@ -89,6 +50,45 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, } } +void VideoWriter::initialize_audio(int sample_rate) { + assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams + const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC); + assert(audio_avcodec); + this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec); + assert(this->audio_codec_ctx); + this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP; + this->audio_codec_ctx->sample_rate = sample_rate; + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1); + #else + this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO; + #endif + this->audio_codec_ctx->bit_rate = 32000; + this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate}; + int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL); + assert(err >= 0); + av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment + + this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL); + assert(this->audio_stream); + err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx); + assert(err >= 0); + + this->audio_frame = av_frame_alloc(); + assert(this->audio_frame); + this->audio_frame->format = this->audio_codec_ctx->sample_fmt; + #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+ + av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout); + #else + this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout; + #endif + this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate; + this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size; + err = av_frame_get_buffer(this->audio_frame, 0); + assert(err >= 0); +} + void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe) { if (of && data) { size_t written = util::safe_fwrite(data, 1, len, of); @@ -106,9 +106,9 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx); assert(err >= 0); - err = avformat_write_header(ofmt_ctx, NULL); - assert(err >= 0); - } else { + err = avformat_write_header(ofmt_ctx, NULL); + assert(err >= 0); + } else { // input timestamps are in microseconds AVRational in_timebase = {1, 1000000}; @@ -135,8 +135,13 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } } -void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) { - if (!remuxing || !audio_codec_ctx) return; +void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int sample_rate) { + if (!remuxing) return; + if (!audio_initialized) { + initialize_audio(sample_rate); + audio_initialized = true; + } + if (!audio_codec_ctx) return; // sync logMonoTime of first audio packet with the timestampEof of first video packet if (audio_pts == 0) { @@ -174,6 +179,7 @@ void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) { if (err < 0) { LOGW("AUDIO: Write frame failed - error: %d", err); } + av_packet_unref(pkt); } av_packet_free(&pkt); } else { diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 724acf0a3877c2..09f190575e5bdd 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -12,13 +12,14 @@ extern "C" { class VideoWriter { public: - VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio); + VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec); void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe); - void write_audio(uint8_t *data, int len, long long timestamp); + void write_audio(uint8_t *data, int len, long long timestamp, int sample_rate); ~VideoWriter(); private: + void initialize_audio(int sample_rate); void encode_and_write_audio_frame(AVFrame* frame); std::string vid_path, lock_path; @@ -28,6 +29,7 @@ class VideoWriter { AVFormatContext *ofmt_ctx; AVStream *out_stream; + bool audio_initialized = false; AVStream *audio_stream = nullptr; AVCodecContext *audio_codec_ctx = nullptr; AVFrame *audio_frame = nullptr; From 5f6646b35c8a00015a5f105f7c38c5940a518e83 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 21:11:55 -0700 Subject: [PATCH 12/16] update comments --- system/loggerd/loggerd.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc index de8e186e8942bc..144ab9f34955e2 100644 --- a/system/loggerd/loggerd.cc +++ b/system/loggerd/loggerd.cc @@ -79,7 +79,6 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc LOGW("%s: dropped %d non iframe packets before init", encoder_info.publish_name, re.dropped_frames); re.dropped_frames = 0; } - // if we aren't actually recording, don't create the writer if (encoder_info.record) { // write the header auto header = edata.getHeader(); @@ -135,6 +134,7 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct // if this is a new segment, we close any possible old segments, move to the new, and process any queued packets if (re.current_segment != s->logger.segment()) { + // if we aren't actually recording, don't create the writer if (encoder_info.record) { assert(encoder_info.filename != NULL); re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(), @@ -145,9 +145,9 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct } re.current_segment = s->logger.segment(); re.marked_ready_to_rotate = false; - // we are in this segment now, process any queued messages before this one } if (re.audio_initialized || !encoder_info.include_audio) { + // we are in this segment now, process any queued messages before this one if (!re.q.empty()) { for (auto qmsg : re.q) { capnp::FlatArrayMessageReader reader({(capnp::word *)qmsg->getData(), qmsg->getSize() / sizeof(capnp::word)}); From 6b751f8d51588d5cf643b7a9ee56a8f6b37f988e Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 21:53:08 -0700 Subject: [PATCH 13/16] ensure header is written before writing audio --- system/loggerd/video_writer.cc | 9 ++++++--- system/loggerd/video_writer.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 960b5a14c1f503..e90fbd2c97d64e 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -106,9 +106,11 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx); assert(err >= 0); - err = avformat_write_header(ofmt_ctx, NULL); - assert(err >= 0); - } else { + // if there is an audio stream, it must be intialized before this point + err = avformat_write_header(ofmt_ctx, NULL); + assert(err >= 0); + header_written = true; + } else { // input timestamps are in microseconds AVRational in_timebase = {1, 1000000}; @@ -142,6 +144,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s audio_initialized = true; } if (!audio_codec_ctx) return; + if (!header_written) return; // header not written yet, skip processing audio frame // sync logMonoTime of first audio packet with the timestampEof of first video packet if (audio_pts == 0) { diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h index 09f190575e5bdd..25e6484d58e8c5 100644 --- a/system/loggerd/video_writer.h +++ b/system/loggerd/video_writer.h @@ -30,6 +30,7 @@ class VideoWriter { AVStream *out_stream; bool audio_initialized = false; + bool header_written = false; AVStream *audio_stream = nullptr; AVCodecContext *audio_codec_ctx = nullptr; AVFrame *audio_frame = nullptr; From 95d0b5a78fe2253fe59a4e5659a0995e0ec140d6 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 22:09:29 -0700 Subject: [PATCH 14/16] buffer audio frame but do not process before header written --- system/loggerd/video_writer.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index e90fbd2c97d64e..43d66626392851 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -144,8 +144,6 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s audio_initialized = true; } if (!audio_codec_ctx) return; - if (!header_written) return; // header not written yet, skip processing audio frame - // sync logMonoTime of first audio packet with the timestampEof of first video packet if (audio_pts == 0) { audio_pts = (timestamp * audio_codec_ctx->sample_rate) / 1000000ULL; @@ -160,6 +158,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size, [](int16_t sample) { return sample * normalizer; }); + if (!header_written) return; // header not written yet, process audio frame after header is written while (audio_buffer.size() >= audio_codec_ctx->frame_size) { audio_frame->pts = audio_pts; float *f_samples = reinterpret_cast(audio_frame->data[0]); From 6f36b16c961796d91b010e4472da41a6942fe8db Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 22:50:44 -0700 Subject: [PATCH 15/16] handle buffer overflow now that we are using as an actual buffer --- system/loggerd/video_writer.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 43d66626392851..64fc170cef4cf9 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -153,6 +153,16 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s const int16_t *raw_samples = reinterpret_cast(data); int sample_count = len / sizeof(int16_t); constexpr float normalizer = 1.0f / 32768.0f; + + const size_t max_buffer_size = sample_rate * 10; // 10 seconds + if (audio_buffer.size() + sample_count > max_buffer_size) { + size_t samples_to_drop = (audio_buffer.size() + sample_count) - max_buffer_size; + LOGE("Audio buffer overflow, dropping %zu oldest samples", samples_to_drop); + audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + samples_to_drop); + audio_pts += samples_to_drop; + } + + // Add new samples to the buffer const size_t original_size = audio_buffer.size(); audio_buffer.resize(original_size + sample_count); std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size, @@ -162,7 +172,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s while (audio_buffer.size() >= audio_codec_ctx->frame_size) { audio_frame->pts = audio_pts; float *f_samples = reinterpret_cast(audio_frame->data[0]); - std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples); + std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples); audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size); encode_and_write_audio_frame(audio_frame); } From cf6c3073c35243912bbb972b57d057cd0d2b4711 Mon Sep 17 00:00:00 2001 From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com> Date: Thu, 3 Jul 2025 22:54:17 -0700 Subject: [PATCH 16/16] spelling --- system/loggerd/video_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc index 64fc170cef4cf9..68e870982f3d3c 100644 --- a/system/loggerd/video_writer.cc +++ b/system/loggerd/video_writer.cc @@ -106,7 +106,7 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc } int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx); assert(err >= 0); - // if there is an audio stream, it must be intialized before this point + // if there is an audio stream, it must be initialized before this point err = avformat_write_header(ofmt_ctx, NULL); assert(err >= 0); header_written = true;