loggerd: initialize audio separately from the VideoWriter constructor and pass sample_rate in through write_audio

Quantizr · Quantizr · commit 15694516940a · 2025-07-03T20:57:30.000-07:00
diff --git a/system/loggerd/loggerd.cc b/system/loggerd/loggerd.cc
@@ -62,6 +62,7 @@ struct RemoteEncoder {
   bool recording = false;
   bool marked_ready_to_rotate = false;
   bool seen_first_packet = false;
+  bool audio_initialized = false;
 };
 
 size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEncoder &re, const EncoderInfo &encoder_info) {
@@ -80,11 +81,6 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc
       }
       // if we aren't actually recording, don't create the writer
       if (encoder_info.record) {
-        assert(encoder_info.filename != NULL);
-        re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
-                                        encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
-                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType(),
-                                        encoder_info.include_audio));
         // write the header
         auto header = edata.getHeader();
         re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false);
@@ -139,13 +135,19 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
 
     // if this is a new segment, we close any possible old segments, move to the new, and process any queued packets
     if (re.current_segment != s->logger.segment()) {
-      if (re.recording) {
-        re.writer.reset();
+      if (encoder_info.record) {
+        assert(encoder_info.filename != NULL);
+        re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
+                                        encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
+                                        edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType()));
         re.recording = false;
+        re.audio_initialized = false;
       }
       re.current_segment = s->logger.segment();
       re.marked_ready_to_rotate = false;
       // we are in this segment now, process any queued messages before this one
+    }
+    if (re.audio_initialized || !encoder_info.include_audio) {
       if (!re.q.empty()) {
         for (auto qmsg : re.q) {
           capnp::FlatArrayMessageReader reader({(capnp::word *)qmsg->getData(), qmsg->getSize() / sizeof(capnp::word)});
@@ -154,9 +156,14 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
         }
         re.q.clear();
       }
+      bytes_count += write_encode_data(s, event, re, encoder_info);
+      delete msg;
+    } else if (re.q.size() > MAIN_FPS*10) {
+      LOGE_100("%s: dropping frame waiting for audio initialization, queue is too large", name.c_str());
+      delete msg;
+    } else {
+      re.q.push_back(msg); // queue up all the new segment messages, they go in after audio is initialized
     }
-    bytes_count += write_encode_data(s, event, re, encoder_info);
-    delete msg;
   } else if (offset_segment_num > s->logger.segment()) {
     // encoderd packet has a newer segment, this means encoderd has rolled over
     if (!re.marked_ready_to_rotate) {
@@ -288,9 +295,11 @@ void loggerd_thread() {
           capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
           auto event = cmsg.getRoot<cereal::Event>();
           auto audio_data = event.getRawAudioData().getData();
+          auto sample_rate = event.getRawAudioData().getSampleRate();
           for (auto* encoder : encoders_with_audio) {
             if (encoder && encoder->writer) {
-              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000);
+              encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000, sample_rate);
+              encoder->audio_initialized = true;
             }
           }
         }
diff --git a/system/loggerd/video_writer.cc b/system/loggerd/video_writer.cc
@@ -5,7 +5,7 @@
 #include "common/swaglog.h"
 #include "common/util.h"
 
-VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio)
+VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec)
   : remuxing(remuxing) {
   vid_path = util::string_format("%s/%s", path, filename);
   lock_path = util::string_format("%s/%s.lock", path, filename);
@@ -41,45 +41,6 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
     this->out_stream = avformat_new_stream(this->ofmt_ctx, raw ? avcodec : NULL);
     assert(this->out_stream);
 
-    if (include_audio) {
-      assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
-      const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
-      assert(audio_avcodec);
-      this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
-      assert(this->audio_codec_ctx);
-      this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-      this->audio_codec_ctx->sample_rate = 16000; // from system/micd.py
-      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-      av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
-      #else
-      this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
-      #endif
-      this->audio_codec_ctx->bit_rate = 32000;
-      this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-      this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};
-      int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
-      assert(err >= 0);
-      av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
-
-      this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
-      assert(this->audio_stream);
-      err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
-      assert(err >= 0);
-
-      this->audio_frame = av_frame_alloc();
-      assert(this->audio_frame);
-      this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
-      #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
-      av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
-      #else
-      this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
-      #endif
-      this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
-      this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
-      err = av_frame_get_buffer(this->audio_frame, 0);
-      assert(err >= 0);
-    }
-
     int err = avio_open(&this->ofmt_ctx->pb, this->vid_path.c_str(), AVIO_FLAG_WRITE);
     assert(err >= 0);
 
@@ -89,6 +50,45 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
   }
 }
 
+void VideoWriter::initialize_audio(int sample_rate) {  // sets up the AAC encoder, audio stream and audio frame; write_audio calls this once, on its first call, with that packet's sample_rate
+  assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
+  const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);  // audio is always encoded as AAC
+  assert(audio_avcodec);
+  this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
+  assert(this->audio_codec_ctx);
+  this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;  // planar float samples
+  this->audio_codec_ctx->sample_rate = sample_rate;  // rate supplied by the caller rather than a fixed constant
+  #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+  av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);  // default layout for 1 channel (mono)
+  #else
+  this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
+  #endif
+  this->audio_codec_ctx->bit_rate = 32000;  // target bitrate, in bits per second
+  this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+  this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};  // one tick per audio sample
+  int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
+  assert(err >= 0);  // NOTE(review): asserts are the only error handling in this function
+  av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
+  // add an audio stream to the output container and copy the opened encoder's parameters into it
+  this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);  // NOTE(review): presumably has to happen before write() calls avformat_write_header — confirm
+  assert(this->audio_stream);
+  err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
+  assert(err >= 0);
+  // allocate audio_frame with the encoder's sample format, channel layout, sample rate and frame size
+  this->audio_frame = av_frame_alloc();
+  assert(this->audio_frame);
+  this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
+  #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100)  // FFmpeg 5.1+
+  av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
+  #else
+  this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
+  #endif
+  this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
+  this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;  // samples per frame, taken from the opened encoder context
+  err = av_frame_get_buffer(this->audio_frame, 0);  // allocate the frame's sample buffers; 0 lets FFmpeg choose the alignment
+  assert(err >= 0);
+}
+
 void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe) {
   if (of && data) {
     size_t written = util::safe_fwrite(data, 1, len, of);
@@ -106,9 +106,9 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
       }
       int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx);
       assert(err >= 0);
-      err = avformat_write_header(ofmt_ctx, NULL);
-      assert(err >= 0);
-    } else {
+        err = avformat_write_header(ofmt_ctx, NULL);
+        assert(err >= 0);
+      } else {
       // input timestamps are in microseconds
       AVRational in_timebase = {1, 1000000};
 
@@ -135,8 +135,13 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
   }
 }
 
-void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp) {
-  if (!remuxing || !audio_codec_ctx) return;
+void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int sample_rate) {
+  if (!remuxing) return;
+  if (!audio_initialized) {
+    initialize_audio(sample_rate);
+    audio_initialized = true;
+  }
+  if (!audio_codec_ctx) return;
 
   // sync logMonoTime of first audio packet with the timestampEof of first video packet
   if (audio_pts == 0) {
@@ -174,6 +179,7 @@ void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) {
       if (err < 0) {
         LOGW("AUDIO: Write frame failed - error: %d", err);
       }
+      av_packet_unref(pkt);
     }
     av_packet_free(&pkt);
   } else {
diff --git a/system/loggerd/video_writer.h b/system/loggerd/video_writer.h
@@ -12,13 +12,14 @@ extern "C" {
 
 class VideoWriter {
 public:
-  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec, bool include_audio);
+  VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec);
   void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
-  void write_audio(uint8_t *data, int len, long long timestamp);
+  void write_audio(uint8_t *data, int len, long long timestamp, int sample_rate);
 
   ~VideoWriter();
 
 private:
+  void initialize_audio(int sample_rate);
   void encode_and_write_audio_frame(AVFrame* frame);
 
   std::string vid_path, lock_path;
@@ -28,6 +29,7 @@ class VideoWriter {
   AVFormatContext *ofmt_ctx;
   AVStream *out_stream;
 
+  bool audio_initialized = false;
   AVStream *audio_stream = nullptr;
   AVCodecContext *audio_codec_ctx = nullptr;
   AVFrame *audio_frame = nullptr;