From aa2aa1c0157ef84ba359a5c55f3c227465768e0d Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Tue, 24 Jun 2025 23:46:44 -0700
Subject: [PATCH 01/16] encode/store audio as part of video file

---
 system/loggerd/loggerd.cc      |  26 +++++++-
 system/loggerd/loggerd.h       |   2 +
 system/loggerd/video_writer.cc | 107 ++++++++++++++++++++++++++++++++-
 system/loggerd/video_writer.h  |  15 ++++-
 4 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 898216e5b6f6e0..348e1ec78bf99b 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -83,7 +83,8 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc
         assert(encoder_info.filename != NULL);
         re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
                                         encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
-                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType()));
+                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType(),
+                                        encoder_info.include_audio));
         // write the header
         auto header = edata.getHeader();
         re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false);
@@ -214,7 +215,7 @@ void loggerd_thread() {
   typedef struct ServiceState {
     std::string name;
     int counter, freq;
-    bool encoder, user_flag;
+    bool encoder, user_flag, record_audio;
   } ServiceState;
   std::unordered_map<SubSocket*, ServiceState> service_state;
   std::unordered_map<SubSocket*, struct RemoteEncoder> remote_encoders;
@@ -239,6 +240,7 @@ void loggerd_thread() {
         .freq = it.decimation,
         .encoder = encoder,
         .user_flag = it.name == "userFlag",
+        .record_audio = record_audio,
       };
     }
   }
@@ -249,10 +251,20 @@ void loggerd_thread() {
   Params().put("CurrentRoute", s.logger.routeName());
 
   std::map<std::string, EncoderInfo> encoder_infos_dict;
+  std::vector<RemoteEncoder*> encoders_with_audio;
   for (const auto &cam : cameras_logged) {
     for (const auto &encoder_info : cam.encoder_infos) {
       encoder_infos_dict[encoder_info.publish_name] = encoder_info;
       s.max_waiting++;
+
+      if (encoder_info.include_audio) {
+        for (auto& [sock, service] : service_state) {
+          if (service.name == encoder_info.publish_name) {
+            encoders_with_audio.push_back(&remote_encoders[sock]);
+            break;
+          }
+        }
+      }
     }
   }
 
@@ -276,6 +288,16 @@ void loggerd_thread() {
         if (service.encoder) {
           s.last_camera_seen_tms = millis_since_boot();
           bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]);
+        } else if (service.record_audio) {
+          capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
+          auto event = cmsg.getRoot<cereal::Event>();
+          auto audio_data = event.getAudioData();
+          for (auto* encoder : encoders_with_audio) {
+            if (encoder && encoder->writer) {
+              encoder->writer->write_audio(audio_data, event.getLogMonoTime());
+            }
+          }
+          delete msg;
         } else {
           s.logger.write((uint8_t *)msg->getData(), msg->getSize(), in_qlog);
           bytes_count += msg->getSize();
diff --git a/system/loggerd/loggerd.h b/system/loggerd/loggerd.h
index 27d2d37fc42ad5..5dfb178fd5fefa 100644
--- a/system/loggerd/loggerd.h
+++ b/system/loggerd/loggerd.h
@@ -35,6 +35,7 @@ class EncoderInfo {
   const char *thumbnail_name = NULL;
   const char *filename = NULL;
   bool record = true;
+  bool include_audio = false;
   int frame_width = -1;
   int frame_height = -1;
   int fps = MAIN_FPS;
@@ -106,6 +107,7 @@ const EncoderInfo qcam_encoder_info = {
   .encode_type = cereal::EncodeIndex::Type::QCAMERA_H264,
   .frame_width = 526,
   .frame_height = 330,
+  .include_audio = Params().getBool("RecordAudio"),
   INIT_ENCODE_FUNCTIONS(QRoadEncode),
 };
 
diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 90b5f1af3d1b94..6a35ea3a47797b 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -5,7 +5,7 @@
 #include "common/swaglog.h"
 #include "common/util.h"
 
-VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec)
+VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio)
   : remuxing(remuxing) {
   vid_path = util::string_format("%s/%s", path, filename);
   lock_path = util::string_format("%s/%s.lock", path, filename);
@@ -41,6 +41,44 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
     this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL);
     assert(this->out_stream);
 
+    if (has_audio) {
+      if (this->ofmt_ctx->oformat->audio_codec == AV_CODEC_ID_NONE) {
+        LOGE("Output format '%s' does not support audio streams, continuing without audio. Please change the output format or the set include_audio to false.", this->ofmt_ctx->oformat->name);
+      } else {
+        const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
+        assert(audio_avcodec);
+        this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
+        assert(this->audio_codec_ctx);
+        this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+        this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
+        this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
+        this->audio_codec_ctx->bit_rate = 32000;
+        this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+
+        int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
+        assert(err >= 0);
+
+        this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
+        assert(this->audio_stream);
+        err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
+        assert(err >= 0);
+        this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate};
+
+        this->audio_frame = av_frame_alloc();
+        assert(this->audio_frame);
+        this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
+        this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
+        this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
+        this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
+        int ret = av_frame_get_buffer(this->audio_frame, 0);
+        if (ret < 0) {
+          LOGE("AUDIO: Failed to allocate frame buffer: %d", ret);
+          av_frame_free(&this->audio_frame);
+          this->audio_frame = nullptr;
+        }
+      }
+    }
+
     int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE);
     assert(err >= 0);
 
@@ -77,6 +115,7 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
       av_init_packet(&pkt);
       pkt.data = data;
       pkt.size = len;
+      pkt.stream_index = this->out_stream->index;
 
       enum AVRounding rnd = static_cast<enum AVRounding>(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
       pkt.pts = pkt.dts = av_rescale_q_rnd(timestamp, in_timebase, ofmt_ctx->streams[0]->time_base, rnd);
@@ -95,11 +134,77 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
   }
 }
 
+void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime) {
+  if (!this->remuxing || !this->audio_codec_ctx) return;
+
+   // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet
+  if (this->first_audio_logMonoTime == 0) {
+    this->first_audio_logMonoTime = logMonoTime;
+  }
+
+  // convert s16le samples to fltp and add to buffer
+  auto data = audio_data.getData();
+  const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data.begin());
+  for (int i = 0; i < audio_data.getLength(); i++) {
+    this->audio_buffer.push_back(raw_samples[i] / 32768.0f);
+  }
+  this->buffered_samples += audio_data.getLength();
+
+  // only encode/write when we have enough samples for the encoder
+  while (this->buffered_samples >= this->audio_codec_ctx->frame_size) {
+    this->audio_frame->pts = this->next_audio_pts;
+
+    float *f_samples = reinterpret_cast<float*>(this->audio_frame->data[0]);
+    for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) {
+      f_samples[i] = this->audio_buffer[i];
+    }
+
+    // remove used samples from buffer
+    for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) {
+      this->audio_buffer.pop_front();
+    }
+    this->buffered_samples -= this->audio_codec_ctx->frame_size;
+
+    // encode frames
+    int send_result = avcodec_send_frame(this->audio_codec_ctx, this->audio_frame);
+    if (send_result >= 0) {
+      AVPacket *pkt = av_packet_alloc();
+      while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) {
+        // calculate and rescale timestamp based on the current frame's PTS
+        uint64_t total_samples = this->audio_frame->pts;
+        uint64_t time_diff_ns = (total_samples * 1000000000ULL) / this->audio_codec_ctx->sample_rate;
+        uint64_t synchronized_mono_time = this->first_audio_logMonoTime + time_diff_ns;
+        uint64_t timestamp_us = synchronized_mono_time / 1000;
+        AVRational in_timebase = {1, 1000000};
+        int64_t pts = av_rescale_q(timestamp_us, in_timebase, this->audio_stream->time_base);
+
+        this->last_audio_pts = std::max(pts, this->last_audio_pts + 1); // Ensure PTS is monotonically increasing to prevent TS discontinuities
+
+        pkt->pts = pkt->dts = this->last_audio_pts;
+        pkt->stream_index = this->audio_stream->index;
+
+        // write encoded frames
+        int err = av_interleaved_write_frame(this->ofmt_ctx, pkt);
+        if (err < 0) {
+          LOGW("AUDIO: Write frame failed - error: %d", err);
+        }
+        av_packet_unref(pkt);
+      }
+      av_packet_free(&pkt);
+    } else {
+      LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result);
+    }
+    this->next_audio_pts += this->audio_codec_ctx->frame_size;
+  }
+}
+
 VideoWriter::~VideoWriter() {
   if (this->remuxing) {
     int err = av_write_trailer(this->ofmt_ctx);
     if (err != 0) LOGE("av_write_trailer failed %d", err);
     avcodec_free_context(&this->codec_ctx);
+    if (this->audio_codec_ctx) avcodec_free_context(&this->audio_codec_ctx);
+    if (this->audio_frame) av_frame_free(&this->audio_frame);
     err = avio_closep(&this->ofmt_ctx->pb);
     if (err != 0) LOGE("avio_closep failed %d", err);
     avformat_free_context(this->ofmt_ctx);
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 1aa758b42b78b0..b4a7d5ee9461ad 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <string>
+#include <deque>
 
 extern "C" {
 #include <libavformat/avformat.h>
@@ -11,9 +12,11 @@ extern "C" {
 
 class VideoWriter {
 public:
-  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec);
+  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio);
   void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
+  void write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime);
   ~VideoWriter();
+
 private:
   std::string vid_path, lock_path;
   FILE *of = nullptr;
@@ -21,5 +24,15 @@ class VideoWriter {
   AVCodecContext *codec_ctx;
   AVFormatContext *ofmt_ctx;
   AVStream *out_stream;
+
+  AVStream *audio_stream = nullptr;
+  AVCodecContext *audio_codec_ctx = nullptr;
+  AVFrame *audio_frame = nullptr;
+  uint64_t next_audio_pts = 0;
+  int64_t last_audio_pts = 0;
+  uint64_t first_audio_logMonoTime = 0;
+  std::deque<float> audio_buffer;
+  uint64_t buffered_samples = 0;
+
   bool remuxing;
 };

From 1409c53041bed7a42c2a54832974675de9ead4fe Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Wed, 25 Jun 2025 14:46:22 -0700
Subject: [PATCH 02/16] better match write_audio() with write()

---
 system/loggerd/loggerd.cc      |  4 +--
 system/loggerd/video_writer.cc | 61 ++++++++++++++--------------------
 system/loggerd/video_writer.h  |  5 ++-
 3 files changed, 29 insertions(+), 41 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 348e1ec78bf99b..4b230e3953843d 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -291,10 +291,10 @@ void loggerd_thread() {
         } else if (service.record_audio) {
           capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
           auto event = cmsg.getRoot<cereal::Event>();
-          auto audio_data = event.getAudioData();
+          auto audio_data = event.getAudioData().getData();
           for (auto* encoder : encoders_with_audio) {
             if (encoder && encoder->writer) {
-              encoder->writer->write_audio(audio_data, event.getLogMonoTime());
+              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000);
             }
           }
           delete msg;
diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 6a35ea3a47797b..fe813278e2ef55 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -134,57 +134,46 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
   }
 }
 
-void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime) {
-  if (!this->remuxing || !this->audio_codec_ctx) return;
+void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
+  if (!remuxing || !audio_codec_ctx) return;
 
    // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet
-  if (this->first_audio_logMonoTime == 0) {
-    this->first_audio_logMonoTime = logMonoTime;
+  if (first_audio_timestamp == 0) {
+    first_audio_timestamp = timestamp; // microseconds
   }
 
   // convert s16le samples to fltp and add to buffer
-  auto data = audio_data.getData();
-  const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data.begin());
-  for (int i = 0; i < audio_data.getLength(); i++) {
-    this->audio_buffer.push_back(raw_samples[i] / 32768.0f);
+  const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data);
+  int sample_count = len / sizeof(int16_t);
+  for (int i = 0; i < sample_count; i++) {
+    audio_buffer.push_back(raw_samples[i] / 32768.0f);
   }
-  this->buffered_samples += audio_data.getLength();
+  buffered_samples += sample_count;
 
-  // only encode/write when we have enough samples for the encoder
-  while (this->buffered_samples >= this->audio_codec_ctx->frame_size) {
-    this->audio_frame->pts = this->next_audio_pts;
+  while (buffered_samples >= audio_codec_ctx->frame_size) {
+    audio_frame->pts = next_audio_pts;
 
-    float *f_samples = reinterpret_cast<float*>(this->audio_frame->data[0]);
-    for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) {
-      f_samples[i] = this->audio_buffer[i];
+    float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
+    for (int i = 0; i < audio_codec_ctx->frame_size; i++) {
+      f_samples[i] = audio_buffer[i];
     }
 
-    // remove used samples from buffer
-    for (int i = 0; i < this->audio_codec_ctx->frame_size; i++) {
-      this->audio_buffer.pop_front();
+    for (int i = 0; i < audio_codec_ctx->frame_size; i++) {
+      audio_buffer.pop_front();
     }
-    this->buffered_samples -= this->audio_codec_ctx->frame_size;
+    buffered_samples -= audio_codec_ctx->frame_size;
 
-    // encode frames
-    int send_result = avcodec_send_frame(this->audio_codec_ctx, this->audio_frame);
+    int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames
     if (send_result >= 0) {
       AVPacket *pkt = av_packet_alloc();
-      while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) {
-        // calculate and rescale timestamp based on the current frame's PTS
-        uint64_t total_samples = this->audio_frame->pts;
-        uint64_t time_diff_ns = (total_samples * 1000000000ULL) / this->audio_codec_ctx->sample_rate;
-        uint64_t synchronized_mono_time = this->first_audio_logMonoTime + time_diff_ns;
-        uint64_t timestamp_us = synchronized_mono_time / 1000;
+      while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) {
+        uint64_t time_diff_us = (audio_frame->pts * 1000000ULL) / audio_codec_ctx->sample_rate;
+        uint64_t synchronized_time = first_audio_timestamp + time_diff_us;
         AVRational in_timebase = {1, 1000000};
-        int64_t pts = av_rescale_q(timestamp_us, in_timebase, this->audio_stream->time_base);
+        pkt->pts = pkt->dts = av_rescale_q(synchronized_time, in_timebase, audio_stream->time_base);
+        pkt->stream_index = audio_stream->index;
 
-        this->last_audio_pts = std::max(pts, this->last_audio_pts + 1); // Ensure PTS is monotonically increasing to prevent TS discontinuities
-
-        pkt->pts = pkt->dts = this->last_audio_pts;
-        pkt->stream_index = this->audio_stream->index;
-
-        // write encoded frames
-        int err = av_interleaved_write_frame(this->ofmt_ctx, pkt);
+        int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frames
         if (err < 0) {
           LOGW("AUDIO: Write frame failed - error: %d", err);
         }
@@ -194,7 +183,7 @@ void VideoWriter::write_audio(const cereal::AudioData::Reader &audio_data, uint6
     } else {
       LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result);
     }
-    this->next_audio_pts += this->audio_codec_ctx->frame_size;
+    next_audio_pts += audio_codec_ctx->frame_size;
   }
 }
 
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index b4a7d5ee9461ad..ef3fb56213ae3e 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -14,7 +14,7 @@ class VideoWriter {
 public:
   VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio);
   void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
-  void write_audio(const cereal::AudioData::Reader &audio_data, uint64_t logMonoTime);
+  void write_audio(uint8_t *data, int len, long long timestamp);
   ~VideoWriter();
 
 private:
@@ -29,8 +29,7 @@ class VideoWriter {
   AVCodecContext *audio_codec_ctx = nullptr;
   AVFrame *audio_frame = nullptr;
   uint64_t next_audio_pts = 0;
-  int64_t last_audio_pts = 0;
-  uint64_t first_audio_logMonoTime = 0;
+  uint64_t first_audio_timestamp = 0;
   std::deque<float> audio_buffer;
   uint64_t buffered_samples = 0;
 

From d0e9a1e0683146f54b872331153c9b0a3f7e2bc0 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Wed, 25 Jun 2025 14:47:51 -0700
Subject: [PATCH 03/16] handle different FFmpeg versions, flush audio encoder,
 suppress encoder QAvg/info messages

---
 system/loggerd/video_writer.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index fe813278e2ef55..6b89f4a7e83d76 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -51,12 +51,17 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
         assert(this->audio_codec_ctx);
         this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
         this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
+        #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+        av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
+        #else
         this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
+        #endif
         this->audio_codec_ctx->bit_rate = 32000;
         this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
 
         int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
         assert(err >= 0);
+        av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
 
         this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
         assert(this->audio_stream);
@@ -67,7 +72,11 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
         this->audio_frame = av_frame_alloc();
         assert(this->audio_frame);
         this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
+        #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+        av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
+        #else
         this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
+        #endif
         this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
         this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
         int ret = av_frame_get_buffer(this->audio_frame, 0);
@@ -189,10 +198,16 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
 
 VideoWriter::~VideoWriter() {
   if (this->remuxing) {
+    if (this->audio_codec_ctx) { // flush audio encoder
+      avcodec_send_frame(this->audio_codec_ctx, NULL);
+      AVPacket *pkt = av_packet_alloc();
+      while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) av_packet_unref(pkt);
+      av_packet_free(&pkt);
+      avcodec_free_context(&this->audio_codec_ctx);
+    }
     int err = av_write_trailer(this->ofmt_ctx);
     if (err != 0) LOGE("av_write_trailer failed %d", err);
     avcodec_free_context(&this->codec_ctx);
-    if (this->audio_codec_ctx) avcodec_free_context(&this->audio_codec_ctx);
     if (this->audio_frame) av_frame_free(&this->audio_frame);
     err = avio_closep(&this->ofmt_ctx->pb);
     if (err != 0) LOGE("avio_closep failed %d", err);

From 70d3dc24e3e5e27b2720baac5586d4415ee5664f Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:46:51 -0700
Subject: [PATCH 04/16] use audio_buffer.size() instead of keeping track of
 size separately

---
 system/loggerd/video_writer.cc | 4 +---
 system/loggerd/video_writer.h  | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 6b89f4a7e83d76..241cf99c61b1a9 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -157,9 +157,8 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
   for (int i = 0; i < sample_count; i++) {
     audio_buffer.push_back(raw_samples[i] / 32768.0f);
   }
-  buffered_samples += sample_count;
 
-  while (buffered_samples >= audio_codec_ctx->frame_size) {
+  while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
     audio_frame->pts = next_audio_pts;
 
     float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
@@ -170,7 +169,6 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
     for (int i = 0; i < audio_codec_ctx->frame_size; i++) {
       audio_buffer.pop_front();
     }
-    buffered_samples -= audio_codec_ctx->frame_size;
 
     int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames
     if (send_result >= 0) {
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index ef3fb56213ae3e..986de9780b1808 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -31,7 +31,6 @@ class VideoWriter {
   uint64_t next_audio_pts = 0;
   uint64_t first_audio_timestamp = 0;
   std::deque<float> audio_buffer;
-  uint64_t buffered_samples = 0;
 
   bool remuxing;
 };

From 1b994cab96d96f6671933a25d628a639d4847f37 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:40:23 -0700
Subject: [PATCH 05/16] replace for loops with std algorithms, use std::vector for audio_buffer

---
 system/loggerd/video_writer.cc | 18 ++++++++----------
 system/loggerd/video_writer.h  |  4 ++--
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 241cf99c61b1a9..0f50f1f5a9b7d0 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -154,21 +154,19 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
   // convert s16le samples to fltp and add to buffer
   const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data);
   int sample_count = len / sizeof(int16_t);
-  for (int i = 0; i < sample_count; i++) {
-    audio_buffer.push_back(raw_samples[i] / 32768.0f);
-  }
+  audio_buffer.reserve(audio_buffer.size() + sample_count);
+  constexpr float normalizer = 1.0f / 32768.0f;
+  std::transform(raw_samples, raw_samples + sample_count, std::back_inserter(audio_buffer),
+                 [](int16_t sample) {
+                     return sample * normalizer;
+                 });
 
   while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
     audio_frame->pts = next_audio_pts;
 
     float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
-    for (int i = 0; i < audio_codec_ctx->frame_size; i++) {
-      f_samples[i] = audio_buffer[i];
-    }
-
-    for (int i = 0; i < audio_codec_ctx->frame_size; i++) {
-      audio_buffer.pop_front();
-    }
+    std::copy(audio_buffer.begin(),  audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples);
+    audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size);
 
     int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames
     if (send_result >= 0) {
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 986de9780b1808..110fba9b97b75f 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <string>
-#include <deque>
+#include <vector>
 
 extern "C" {
 #include <libavformat/avformat.h>
@@ -30,7 +30,7 @@ class VideoWriter {
   AVFrame *audio_frame = nullptr;
   uint64_t next_audio_pts = 0;
   uint64_t first_audio_timestamp = 0;
-  std::deque<float> audio_buffer;
+  std::vector<float> audio_buffer;
 
   bool remuxing;
 };

From 9d01fb9832f019d0e8f7ca136a0df130eb0707a7 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 26 Jun 2025 13:55:21 -0700
Subject: [PATCH 06/16] save audio to both qcam and rlog

---
 system/loggerd/loggerd.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 4b230e3953843d..090831a6d8b7e4 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -285,10 +285,8 @@ void loggerd_thread() {
       Message *msg = nullptr;
       while (!do_exit && (msg = sock->receive(true))) {
         const bool in_qlog = service.freq != -1 && (service.counter++ % service.freq == 0);
-        if (service.encoder) {
-          s.last_camera_seen_tms = millis_since_boot();
-          bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]);
-        } else if (service.record_audio) {
+
+        if (service.record_audio) {
           capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
           auto event = cmsg.getRoot<cereal::Event>();
           auto audio_data = event.getAudioData().getData();
@@ -297,7 +295,11 @@ void loggerd_thread() {
               encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000);
             }
           }
-          delete msg;
+        }
+
+        if (service.encoder) {
+          s.last_camera_seen_tms = millis_since_boot();
+          bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]);
         } else {
           s.logger.write((uint8_t *)msg->getData(), msg->getSize(), in_qlog);
           bytes_count += msg->getSize();

From 488844955f75053931a3f20829eef7cd4c511dcd Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 26 Jun 2025 16:53:25 -0700
Subject: [PATCH 07/16] assert audio support check

---
 system/loggerd/video_writer.cc | 83 ++++++++++++++++------------------
 1 file changed, 40 insertions(+), 43 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 0f50f1f5a9b7d0..d246a00d4e4288 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -42,49 +42,46 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
     assert(this->out_stream);
 
     if (has_audio) {
-      if (this->ofmt_ctx->oformat->audio_codec == AV_CODEC_ID_NONE) {
-        LOGE("Output format '%s' does not support audio streams, continuing without audio. Please change the output format or the set include_audio to false.", this->ofmt_ctx->oformat->name);
-      } else {
-        const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
-        assert(audio_avcodec);
-        this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
-        assert(this->audio_codec_ctx);
-        this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-        this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
-        #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-        av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
-        #else
-        this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
-        #endif
-        this->audio_codec_ctx->bit_rate = 32000;
-        this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-
-        int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
-        assert(err >= 0);
-        av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
-
-        this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
-        assert(this->audio_stream);
-        err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
-        assert(err >= 0);
-        this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate};
-
-        this->audio_frame = av_frame_alloc();
-        assert(this->audio_frame);
-        this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
-        #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-        av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
-        #else
-        this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
-        #endif
-        this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
-        this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
-        int ret = av_frame_get_buffer(this->audio_frame, 0);
-        if (ret < 0) {
-          LOGE("AUDIO: Failed to allocate frame buffer: %d", ret);
-          av_frame_free(&this->audio_frame);
-          this->audio_frame = nullptr;
-        }
+      assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
+      const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
+      assert(audio_avcodec);
+      this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
+      assert(this->audio_codec_ctx);
+      this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+      this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
+      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+      av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
+      #else
+      this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
+      #endif
+      this->audio_codec_ctx->bit_rate = 32000;
+      this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+
+      int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
+      assert(err >= 0);
+      av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
+
+      this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
+      assert(this->audio_stream);
+      err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
+      assert(err >= 0);
+      this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate};
+
+      this->audio_frame = av_frame_alloc();
+      assert(this->audio_frame);
+      this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
+      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+      av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
+      #else
+      this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
+      #endif
+      this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
+      this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
+      int ret = av_frame_get_buffer(this->audio_frame, 0);
+      if (ret < 0) {
+        LOGE("AUDIO: Failed to allocate frame buffer: %d", ret);
+        av_frame_free(&this->audio_frame);
+        this->audio_frame = nullptr;
       }
     }
 

From 78f22320ad11903c015165313688b7b650aeb9d2 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 26 Jun 2025 19:17:58 -0700
Subject: [PATCH 08/16] microphone --> soundPressure, audioData -->
 rawAudioData

---
 system/loggerd/loggerd.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 090831a6d8b7e4..4cb31a6be19de1 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -289,7 +289,7 @@ void loggerd_thread() {
         if (service.record_audio) {
           capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
           auto event = cmsg.getRoot<cereal::Event>();
-          auto audio_data = event.getAudioData().getData();
+          auto audio_data = event.getRawAudioData().getData();
           for (auto* encoder : encoders_with_audio) {
             if (encoder && encoder->writer) {
               encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000);

From 4a157c3e82758a0f23db6200bf677c76b62783cc Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 26 Jun 2025 20:01:26 -0700
Subject: [PATCH 09/16] deque much more efficient if buffer ever >> frame_size,
 ~ same performance for defaults

---
 system/loggerd/video_writer.cc | 9 ++++-----
 system/loggerd/video_writer.h  | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index d246a00d4e4288..c6c52ba4a90821 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -151,12 +151,11 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
   // convert s16le samples to fltp and add to buffer
   const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data);
   int sample_count = len / sizeof(int16_t);
-  audio_buffer.reserve(audio_buffer.size() + sample_count);
   constexpr float normalizer = 1.0f / 32768.0f;
-  std::transform(raw_samples, raw_samples + sample_count, std::back_inserter(audio_buffer),
-                 [](int16_t sample) {
-                     return sample * normalizer;
-                 });
+  const size_t original_size = audio_buffer.size();
+  audio_buffer.resize(original_size + sample_count);
+  std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size,
+                [](int16_t sample) { return sample * normalizer; });
 
   while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
     audio_frame->pts = next_audio_pts;
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 110fba9b97b75f..986de9780b1808 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <string>
-#include <vector>
+#include <deque>
 
 extern "C" {
 #include <libavformat/avformat.h>
@@ -30,7 +30,7 @@ class VideoWriter {
   AVFrame *audio_frame = nullptr;
   uint64_t next_audio_pts = 0;
   uint64_t first_audio_timestamp = 0;
-  std::vector<float> audio_buffer;
+  std::deque<float> audio_buffer;
 
   bool remuxing;
 };

From 6b951cf2a80628ff06bb671b8fbd96b1b7bd25cd Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 14:54:50 -0700
Subject: [PATCH 10/16] cleanup and fix time scaling

---
 system/loggerd/loggerd.cc      | 16 ++++----
 system/loggerd/video_writer.cc | 70 +++++++++++++++-------------------
 system/loggerd/video_writer.h  |  8 ++--
 3 files changed, 43 insertions(+), 51 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 4cb31a6be19de1..3826a829ad176b 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -256,15 +256,13 @@ void loggerd_thread() {
     for (const auto &encoder_info : cam.encoder_infos) {
       encoder_infos_dict[encoder_info.publish_name] = encoder_info;
       s.max_waiting++;
+    }
+  }
 
-      if (encoder_info.include_audio) {
-        for (auto& [sock, service] : service_state) {
-          if (service.name == encoder_info.publish_name) {
-            encoders_with_audio.push_back(&remote_encoders[sock]);
-            break;
-          }
-        }
-      }
+  for (auto &[sock, service] : service_state) {
+    auto it = encoder_infos_dict.find(service.name);
+    if (it != encoder_infos_dict.end() && it->second.include_audio) {
+      encoders_with_audio.push_back(&remote_encoders[sock]);
     }
   }
 
@@ -292,7 +290,7 @@ void loggerd_thread() {
           auto audio_data = event.getRawAudioData().getData();
           for (auto* encoder : encoders_with_audio) {
             if (encoder && encoder->writer) {
-              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime()/1000);
+              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000);
             }
           }
         }
diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index c6c52ba4a90821..f954ccca886036 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -5,7 +5,7 @@
 #include "common/swaglog.h"
 #include "common/util.h"
 
-VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio)
+VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio)
   : remuxing(remuxing) {
   vid_path = util::string_format("%s/%s", path, filename);
   lock_path = util::string_format("%s/%s.lock", path, filename);
@@ -41,7 +41,7 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
     this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL);
     assert(this->out_stream);
 
-    if (has_audio) {
+    if (include_audio) {
       assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
       const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
       assert(audio_avcodec);
@@ -56,7 +56,7 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
       #endif
       this->audio_codec_ctx->bit_rate = 32000;
       this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-
+      this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};
       int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
       assert(err >= 0);
       av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
@@ -65,7 +65,6 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
       assert(this->audio_stream);
       err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
       assert(err >= 0);
-      this->audio_stream->time_base = (AVRational){1, this->audio_codec_ctx->sample_rate};
 
       this->audio_frame = av_frame_alloc();
       assert(this->audio_frame);
@@ -77,12 +76,8 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
       #endif
       this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
       this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
-      int ret = av_frame_get_buffer(this->audio_frame, 0);
-      if (ret < 0) {
-        LOGE("AUDIO: Failed to allocate frame buffer: %d", ret);
-        av_frame_free(&this->audio_frame);
-        this->audio_frame = nullptr;
-      }
+      err = av_frame_get_buffer(this->audio_frame, 0);
+      assert(err >= 0);
     }
 
     int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE);
@@ -143,9 +138,9 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
 void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
   if (!remuxing || !audio_codec_ctx) return;
 
-   // approximately sync with video by syncing the timestampEof of first video packet with the logMonoTime of first audio packet
-  if (first_audio_timestamp == 0) {
-    first_audio_timestamp = timestamp; // microseconds
+  // sync logMonoTime of first audio packet with the timestampEof of first video packet
+  if (audio_pts == 0) {
+    audio_pts = (timestamp * audio_codec_ctx->sample_rate) / 1000000ULL;
   }
 
   // convert s16le samples to fltp and add to buffer
@@ -158,43 +153,40 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
                 [](int16_t sample) { return sample * normalizer; });
 
   while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
-    audio_frame->pts = next_audio_pts;
-
+    audio_frame->pts = audio_pts;
     float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
     std::copy(audio_buffer.begin(),  audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples);
     audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size);
+    encode_and_write_audio_frame(audio_frame);
+  }
+}
 
-    int send_result = avcodec_send_frame(audio_codec_ctx, audio_frame); // encode frames
-    if (send_result >= 0) {
-      AVPacket *pkt = av_packet_alloc();
-      while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) {
-        uint64_t time_diff_us = (audio_frame->pts * 1000000ULL) / audio_codec_ctx->sample_rate;
-        uint64_t synchronized_time = first_audio_timestamp + time_diff_us;
-        AVRational in_timebase = {1, 1000000};
-        pkt->pts = pkt->dts = av_rescale_q(synchronized_time, in_timebase, audio_stream->time_base);
-        pkt->stream_index = audio_stream->index;
-
-        int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frames
-        if (err < 0) {
-          LOGW("AUDIO: Write frame failed - error: %d", err);
-        }
-        av_packet_unref(pkt);
+void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) {
+  if (!remuxing || !audio_codec_ctx) return;
+  int send_result = avcodec_send_frame(audio_codec_ctx, frame); // encode frame
+  if (send_result >= 0) {
+    AVPacket *pkt = av_packet_alloc();
+    while (avcodec_receive_packet(audio_codec_ctx, pkt) == 0) {
+      av_packet_rescale_ts(pkt, audio_codec_ctx->time_base, audio_stream->time_base);
+      pkt->stream_index = audio_stream->index;
+
+      int err = av_interleaved_write_frame(ofmt_ctx, pkt); // write encoded frame
+      if (err < 0) {
+        LOGW("AUDIO: Write frame failed - error: %d", err);
       }
-      av_packet_free(&pkt);
-    } else {
-      LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result);
     }
-    next_audio_pts += audio_codec_ctx->frame_size;
+    av_packet_free(&pkt);
+  } else {
+    LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result);
   }
+  audio_pts += audio_codec_ctx->frame_size;
 }
 
+
 VideoWriter::~VideoWriter() {
   if (this->remuxing) {
-    if (this->audio_codec_ctx) { // flush audio encoder
-      avcodec_send_frame(this->audio_codec_ctx, NULL);
-      AVPacket *pkt = av_packet_alloc();
-      while (avcodec_receive_packet(this->audio_codec_ctx, pkt) == 0) av_packet_unref(pkt);
-      av_packet_free(&pkt);
+    if (this->audio_codec_ctx) {
+      encode_and_write_audio_frame(NULL); // flush encoder
       avcodec_free_context(&this->audio_codec_ctx);
     }
     int err = av_write_trailer(this->ofmt_ctx);
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 986de9780b1808..724acf0a3877c2 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -12,12 +12,15 @@ extern "C" {
 
 class VideoWriter {
 public:
-  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool has_audio);
+  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio);
   void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
   void write_audio(uint8_t *data, int len, long long timestamp);
+
   ~VideoWriter();
 
 private:
+  void encode_and_write_audio_frame(AVFrame* frame);
+
   std::string vid_path, lock_path;
   FILE *of = nullptr;
 
@@ -28,8 +31,7 @@ class VideoWriter {
   AVStream *audio_stream = nullptr;
   AVCodecContext *audio_codec_ctx = nullptr;
   AVFrame *audio_frame = nullptr;
-  uint64_t next_audio_pts = 0;
-  uint64_t first_audio_timestamp = 0;
+  uint64_t audio_pts = 0;
   std::deque<float> audio_buffer;
 
   bool remuxing;

From 15694516940a83edcfbff01ddd717c41b1f2b18d Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 20:57:30 -0700
Subject: [PATCH 11/16] initialize audio separately and pass sample_rate in

---
 system/loggerd/loggerd.cc      | 29 ++++++----
 system/loggerd/video_writer.cc | 96 ++++++++++++++++++----------------
 system/loggerd/video_writer.h  |  6 ++-
 3 files changed, 74 insertions(+), 57 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index 3826a829ad176b..de8e186e8942bc 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -62,6 +62,7 @@ struct RemoteEncoder {
   bool recording = false;
   bool marked_ready_to_rotate = false;
   bool seen_first_packet = false;
+  bool audio_initialized = false;
 };
 
 size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEncoder &re, const EncoderInfo &encoder_info) {
@@ -80,11 +81,6 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc
       }
       // if we aren't actually recording, don't create the writer
       if (encoder_info.record) {
-        assert(encoder_info.filename != NULL);
-        re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
-                                        encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
-                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType(),
-                                        encoder_info.include_audio));
         // write the header
         auto header = edata.getHeader();
         re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false);
@@ -139,13 +135,19 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
 
     // if this is a new segment, we close any possible old segments, move to the new, and process any queued packets
     if (re.current_segment != s->logger.segment()) {
-      if (re.recording) {
-        re.writer.reset();
+      if (encoder_info.record) {
+        assert(encoder_info.filename != NULL);
+        re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
+                                        encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
+                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType()));
         re.recording = false;
+        re.audio_initialized = false;
       }
       re.current_segment = s->logger.segment();
       re.marked_ready_to_rotate = false;
       // we are in this segment now, process any queued messages before this one
+    }
+    if (re.audio_initialized || !encoder_info.include_audio) {
       if (!re.q.empty()) {
         for (auto qmsg : re.q) {
           capnp::FlatArrayMessageReader reader({(capnp::word *)qmsg->getData(), qmsg->getSize() / sizeof(capnp::word)});
@@ -154,9 +156,14 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
         }
         re.q.clear();
       }
+      bytes_count += write_encode_data(s, event, re, encoder_info);
+      delete msg;
+    } else if (re.q.size() > MAIN_FPS*10) {
+      LOGE_100("%s: dropping frame waiting for audio initialization, queue is too large", name.c_str());
+      delete msg;
+    } else {
+      re.q.push_back(msg); // queue up all the new segment messages, they go in after audio is initialized
     }
-    bytes_count += write_encode_data(s, event, re, encoder_info);
-    delete msg;
   } else if (offset_segment_num > s->logger.segment()) {
     // encoderd packet has a newer segment, this means encoderd has rolled over
     if (!re.marked_ready_to_rotate) {
@@ -288,9 +295,11 @@ void loggerd_thread() {
           capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
           auto event = cmsg.getRoot<cereal::Event>();
           auto audio_data = event.getRawAudioData().getData();
+          auto sample_rate = event.getRawAudioData().getSampleRate();
           for (auto* encoder : encoders_with_audio) {
             if (encoder && encoder->writer) {
-              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000);
+              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000, sample_rate);
+              encoder->audio_initialized = true;
             }
           }
         }
diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index f954ccca886036..960b5a14c1f503 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -5,7 +5,7 @@
 #include "common/swaglog.h"
 #include "common/util.h"
 
-VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio)
+VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec)
   : remuxing(remuxing) {
   vid_path = util::string_format("%s/%s", path, filename);
   lock_path = util::string_format("%s/%s.lock", path, filename);
@@ -41,45 +41,6 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
     this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL);
     assert(this->out_stream);
 
-    if (include_audio) {
-      assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
-      const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
-      assert(audio_avcodec);
-      this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
-      assert(this->audio_codec_ctx);
-      this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-      this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
-      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-      av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
-      #else
-      this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
-      #endif
-      this->audio_codec_ctx->bit_rate = 32000;
-      this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-      this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};
-      int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
-      assert(err >= 0);
-      av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
-
-      this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
-      assert(this->audio_stream);
-      err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
-      assert(err >= 0);
-
-      this->audio_frame = av_frame_alloc();
-      assert(this->audio_frame);
-      this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
-      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-      av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
-      #else
-      this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
-      #endif
-      this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
-      this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
-      err = av_frame_get_buffer(this->audio_frame, 0);
-      assert(err >= 0);
-    }
-
     int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE);
     assert(err >= 0);
 
@@ -89,6 +50,45 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
   }
 }
 
+void VideoWriter::initialize_audio(int sample_rate) {
+  assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
+  const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
+  assert(audio_avcodec);
+  this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
+  assert(this->audio_codec_ctx);
+  this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+  this->audio_codec_ctx->sample_rate = sample_rate;
+  #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+  av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
+  #else
+  this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
+  #endif
+  this->audio_codec_ctx->bit_rate = 32000;
+  this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+  this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};
+  int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
+  assert(err >= 0);
+  av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
+
+  this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
+  assert(this->audio_stream);
+  err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
+  assert(err >= 0);
+
+  this->audio_frame = av_frame_alloc();
+  assert(this->audio_frame);
+  this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
+  #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+  av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
+  #else
+  this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
+  #endif
+  this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
+  this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
+  err = av_frame_get_buffer(this->audio_frame, 0);
+  assert(err >= 0);
+}
+
 void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe) {
   if (of && data) {
     size_t written = util::safe_fwrite(data, 1, len, of);
@@ -106,9 +106,9 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
       }
       int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx);
       assert(err >= 0);
-      err = avformat_write_header(ofmt_ctx, NULL);
-      assert(err >= 0);
-    } else {
+        err = avformat_write_header(ofmt_ctx, NULL);
+        assert(err >= 0);
+      } else {
       // input timestamps are in microseconds
       AVRational in_timebase = {1, 1000000};
 
@@ -135,8 +135,13 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
   }
 }
 
-void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
-  if (!remuxing || !audio_codec_ctx) return;
+void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int sample_rate) {
+  if (!remuxing) return;
+  if (!audio_initialized) {
+    initialize_audio(sample_rate);
+    audio_initialized = true;
+  }
+  if (!audio_codec_ctx) return;
 
   // sync logMonoTime of first audio packet with the timestampEof of first video packet
   if (audio_pts == 0) {
@@ -174,6 +179,7 @@ void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) {
       if (err < 0) {
         LOGW("AUDIO: Write frame failed - error: %d", err);
       }
+      av_packet_unref(pkt);
     }
     av_packet_free(&pkt);
   } else {
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 724acf0a3877c2..09f190575e5bdd 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -12,13 +12,14 @@ extern "C" {
 
 class VideoWriter {
 public:
-  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio);
+  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec);
   void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
-  void write_audio(uint8_t *data, int len, long long timestamp);
+  void write_audio(uint8_t *data, int len, long long timestamp, int sample_rate);
 
   ~VideoWriter();
 
 private:
+  void initialize_audio(int sample_rate);
   void encode_and_write_audio_frame(AVFrame* frame);
 
   std::string vid_path, lock_path;
@@ -28,6 +29,7 @@ class VideoWriter {
   AVFormatContext *ofmt_ctx;
   AVStream *out_stream;
 
+  bool audio_initialized = false;
   AVStream *audio_stream = nullptr;
   AVCodecContext *audio_codec_ctx = nullptr;
   AVFrame *audio_frame = nullptr;

From 5f6646b35c8a00015a5f105f7c38c5940a518e83 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 21:11:55 -0700
Subject: [PATCH 12/16] update comments

---
 system/loggerd/loggerd.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
index de8e186e8942bc..144ab9f34955e2 100644
--- a/system/loggerd/loggerd.cc
+++ b/system/loggerd/loggerd.cc
@@ -79,7 +79,6 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc
         LOGW("%s: dropped %d non iframe packets before init", encoder_info.publish_name, re.dropped_frames);
         re.dropped_frames = 0;
       }
-      // if we aren't actually recording, don't create the writer
       if (encoder_info.record) {
         // write the header
         auto header = edata.getHeader();
@@ -135,6 +134,7 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
 
     // if this is a new segment, we close any possible old segments, move to the new, and process any queued packets
     if (re.current_segment != s->logger.segment()) {
+      // if we aren't actually recording, don't create the writer
       if (encoder_info.record) {
         assert(encoder_info.filename != NULL);
         re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
@@ -145,9 +145,9 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
       }
       re.current_segment = s->logger.segment();
       re.marked_ready_to_rotate = false;
-      // we are in this segment now, process any queued messages before this one
     }
     if (re.audio_initialized || !encoder_info.include_audio) {
+      // we are in this segment now, process any queued messages before this one
       if (!re.q.empty()) {
         for (auto qmsg : re.q) {
           capnp::FlatArrayMessageReader reader({(capnp::word *)qmsg->getData(), qmsg->getSize() / sizeof(capnp::word)});

From 6b751f8d51588d5cf643b7a9ee56a8f6b37f988e Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 21:53:08 -0700
Subject: [PATCH 13/16] ensure header is written before writing audio

---
 system/loggerd/video_writer.cc | 9 ++++++---
 system/loggerd/video_writer.h  | 1 +
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 960b5a14c1f503..e90fbd2c97d64e 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -106,9 +106,11 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
       }
       int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx);
       assert(err >= 0);
-        err = avformat_write_header(ofmt_ctx, NULL);
-        assert(err >= 0);
-      } else {
+      // if there is an audio stream, it must be intialized before this point
+      err = avformat_write_header(ofmt_ctx, NULL);
+      assert(err >= 0);
+      header_written = true;
+    } else {
       // input timestamps are in microseconds
       AVRational in_timebase = {1, 1000000};
 
@@ -142,6 +144,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s
     audio_initialized = true;
   }
   if (!audio_codec_ctx) return;
+  if (!header_written) return; // header not written yet, skip processing audio frame
 
   // sync logMonoTime of first audio packet with the timestampEof of first video packet
   if (audio_pts == 0) {
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
index 09f190575e5bdd..25e6484d58e8c5 100644
--- a/system/loggerd/video_writer.h
+++ b/system/loggerd/video_writer.h
@@ -30,6 +30,7 @@ class VideoWriter {
   AVStream *out_stream;
 
   bool audio_initialized = false;
+  bool header_written = false;
   AVStream *audio_stream = nullptr;
   AVCodecContext *audio_codec_ctx = nullptr;
   AVFrame *audio_frame = nullptr;

From 95d0b5a78fe2253fe59a4e5659a0995e0ec140d6 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 22:09:29 -0700
Subject: [PATCH 14/16] buffer audio frame but do not process before header
 written

---
 system/loggerd/video_writer.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index e90fbd2c97d64e..43d66626392851 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -144,8 +144,6 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s
     audio_initialized = true;
   }
   if (!audio_codec_ctx) return;
-  if (!header_written) return; // header not written yet, skip processing audio frame
-
   // sync logMonoTime of first audio packet with the timestampEof of first video packet
   if (audio_pts == 0) {
     audio_pts = (timestamp * audio_codec_ctx->sample_rate) / 1000000ULL;
@@ -160,6 +158,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s
   std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size,
                 [](int16_t sample) { return sample * normalizer; });
 
+  if (!header_written) return; // header not written yet, process audio frame after header is written
   while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
     audio_frame->pts = audio_pts;
     float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);

From 6f36b16c961796d91b010e4472da41a6942fe8db Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 22:50:44 -0700
Subject: [PATCH 15/16] handle buffer overflow now that we are using it as an
 actual buffer

---
 system/loggerd/video_writer.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 43d66626392851..64fc170cef4cf9 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -153,6 +153,16 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s
   const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data);
   int sample_count = len / sizeof(int16_t);
   constexpr float normalizer = 1.0f / 32768.0f;
+
+  const size_t max_buffer_size = sample_rate * 10; // 10 seconds
+  if (audio_buffer.size() + sample_count > max_buffer_size) {
+    size_t samples_to_drop = (audio_buffer.size() + sample_count) - max_buffer_size;
+    LOGE("Audio buffer overflow, dropping %zu oldest samples", samples_to_drop);
+    audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + samples_to_drop);
+    audio_pts += samples_to_drop;
+  }
+
+  // Add new samples to the buffer
   const size_t original_size = audio_buffer.size();
   audio_buffer.resize(original_size + sample_count);
   std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size,
@@ -162,7 +172,7 @@ void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int s
   while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
     audio_frame->pts = audio_pts;
     float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
-    std::copy(audio_buffer.begin(),  audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples);
+    std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples);
     audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size);
     encode_and_write_audio_frame(audio_frame);
   }

From cf6c3073c35243912bbb972b57d057cd0d2b4711 Mon Sep 17 00:00:00 2001
From: "Quantizr (Jimmy)" <9859727+Quantizr@users.noreply.github.com>
Date: Thu, 3 Jul 2025 22:54:17 -0700
Subject: [PATCH 16/16] spelling

---
 system/loggerd/video_writer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
index 64fc170cef4cf9..68e870982f3d3c 100644
--- a/system/loggerd/video_writer.cc
+++ b/system/loggerd/video_writer.cc
@@ -106,7 +106,7 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
       }
       int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx);
       assert(err >= 0);
-      // if there is an audio stream, it must be intialized before this point
+      // if there is an audio stream, it must be initialized before this point
       err = avformat_write_header(ofmt_ctx, NULL);
       assert(err >= 0);
       header_written = true;