Skip to content

Commit a4e4a8a

Browse files
authored
include audio in qcamera.ts (#35608)
* encode/store audio as part of video file
* better match write_audio() with write()
* handle different FFmpeg versions, flush audio encoder, suppress encoder QAvg/info messages
* use audio_buffer.size() instead of keeping track of size separately
* no more for loops
* save to qcam and rlog
* assert audio support check
* microphone --> soundPressure, audioData --> rawAudioData
* deque much more efficient if buffer ever >> frame_size, ~ same performance for defaults
* cleanup and fix time scaling
* initialize audio separately and pass sample_rate in
* update comments
* ensure header is written before writing audio
* buffer audio frame but do not process before header written
* handle buffer overflow now that we are using as an actual buffer
* spelling
1 parent c807ecd commit a4e4a8a

File tree

4 files changed

+170
-10
lines changed

4 files changed

+170
-10
lines changed

system/loggerd/loggerd.cc

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct RemoteEncoder {
6262
bool recording = false;
6363
bool marked_ready_to_rotate = false;
6464
bool seen_first_packet = false;
65+
bool audio_initialized = false;
6566
};
6667

6768
size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEncoder &re, const EncoderInfo &encoder_info) {
@@ -78,12 +79,7 @@ size_t write_encode_data(LoggerdState *s, cereal::Event::Reader event, RemoteEnc
7879
LOGW("%s: dropped %d non iframe packets before init", encoder_info.publish_name, re.dropped_frames);
7980
re.dropped_frames = 0;
8081
}
81-
// if we aren't actually recording, don't create the writer
8282
if (encoder_info.record) {
83-
assert(encoder_info.filename != NULL);
84-
re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
85-
encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
86-
edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType()));
8783
// write the header
8884
auto header = edata.getHeader();
8985
re.writer->write((uint8_t *)header.begin(), header.size(), idx.getTimestampEof() / 1000, true, false);
@@ -138,12 +134,19 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
138134

139135
// if this is a new segment, we close any possible old segments, move to the new, and process any queued packets
140136
if (re.current_segment != s->logger.segment()) {
141-
if (re.recording) {
142-
re.writer.reset();
137+
// if we aren't actually recording, don't create the writer
138+
if (encoder_info.record) {
139+
assert(encoder_info.filename != NULL);
140+
re.writer.reset(new VideoWriter(s->logger.segmentPath().c_str(),
141+
encoder_info.filename, idx.getType() != cereal::EncodeIndex::Type::FULL_H_E_V_C,
142+
edata.getWidth(), edata.getHeight(), encoder_info.fps, idx.getType()));
143143
re.recording = false;
144+
re.audio_initialized = false;
144145
}
145146
re.current_segment = s->logger.segment();
146147
re.marked_ready_to_rotate = false;
148+
}
149+
if (re.audio_initialized || !encoder_info.include_audio) {
147150
// we are in this segment now, process any queued messages before this one
148151
if (!re.q.empty()) {
149152
for (auto qmsg : re.q) {
@@ -153,9 +156,14 @@ int handle_encoder_msg(LoggerdState *s, Message *msg, std::string &name, struct
153156
}
154157
re.q.clear();
155158
}
159+
bytes_count += write_encode_data(s, event, re, encoder_info);
160+
delete msg;
161+
} else if (re.q.size() > MAIN_FPS*10) {
162+
LOGE_100("%s: dropping frame waiting for audio initialization, queue is too large", name.c_str());
163+
delete msg;
164+
} else {
165+
re.q.push_back(msg); // queue up all the new segment messages, they go in after audio is initialized
156166
}
157-
bytes_count += write_encode_data(s, event, re, encoder_info);
158-
delete msg;
159167
} else if (offset_segment_num > s->logger.segment()) {
160168
// encoderd packet has a newer segment, this means encoderd has rolled over
161169
if (!re.marked_ready_to_rotate) {
@@ -214,7 +222,7 @@ void loggerd_thread() {
214222
typedef struct ServiceState {
215223
std::string name;
216224
int counter, freq;
217-
bool encoder, user_flag;
225+
bool encoder, user_flag, record_audio;
218226
} ServiceState;
219227
std::unordered_map<SubSocket*, ServiceState> service_state;
220228
std::unordered_map<SubSocket*, struct RemoteEncoder> remote_encoders;
@@ -239,6 +247,7 @@ void loggerd_thread() {
239247
.freq = it.decimation,
240248
.encoder = encoder,
241249
.user_flag = it.name == "userFlag",
250+
.record_audio = record_audio,
242251
};
243252
}
244253
}
@@ -249,13 +258,21 @@ void loggerd_thread() {
249258
Params().put("CurrentRoute", s.logger.routeName());
250259

251260
std::map<std::string, EncoderInfo> encoder_infos_dict;
261+
std::vector<RemoteEncoder*> encoders_with_audio;
252262
for (const auto &cam : cameras_logged) {
253263
for (const auto &encoder_info : cam.encoder_infos) {
254264
encoder_infos_dict[encoder_info.publish_name] = encoder_info;
255265
s.max_waiting++;
256266
}
257267
}
258268

269+
for (auto &[sock, service] : service_state) {
270+
auto it = encoder_infos_dict.find(service.name);
271+
if (it != encoder_infos_dict.end() && it->second.include_audio) {
272+
encoders_with_audio.push_back(&remote_encoders[sock]);
273+
}
274+
}
275+
259276
uint64_t msg_count = 0, bytes_count = 0;
260277
double start_ts = millis_since_boot();
261278
while (!do_exit) {
@@ -273,6 +290,20 @@ void loggerd_thread() {
273290
Message *msg = nullptr;
274291
while (!do_exit && (msg = sock->receive(true))) {
275292
const bool in_qlog = service.freq != -1 && (service.counter++ % service.freq == 0);
293+
294+
if (service.record_audio) {
295+
capnp::FlatArrayMessageReader cmsg(kj::ArrayPtr<capnp::word>((capnp::word *)msg->getData(), msg->getSize() / sizeof(capnp::word)));
296+
auto event = cmsg.getRoot<cereal::Event>();
297+
auto audio_data = event.getRawAudioData().getData();
298+
auto sample_rate = event.getRawAudioData().getSampleRate();
299+
for (auto* encoder : encoders_with_audio) {
300+
if (encoder && encoder->writer) {
301+
encoder->writer->write_audio((uint8_t*)audio_data.begin(), audio_data.size(), event.getLogMonoTime() / 1000, sample_rate);
302+
encoder->audio_initialized = true;
303+
}
304+
}
305+
}
306+
276307
if (service.encoder) {
277308
s.last_camera_seen_tms = millis_since_boot();
278309
bytes_count += handle_encoder_msg(&s, msg, service.name, remote_encoders[sock], encoder_infos_dict[service.name]);

system/loggerd/loggerd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class EncoderInfo {
3535
const char *thumbnail_name = NULL;
3636
const char *filename = NULL;
3737
bool record = true;
38+
bool include_audio = false;
3839
int frame_width = -1;
3940
int frame_height = -1;
4041
int fps = MAIN_FPS;
@@ -106,6 +107,7 @@ const EncoderInfo qcam_encoder_info = {
106107
.encode_type = cereal::EncodeIndex::Type::QCAMERA_H264,
107108
.frame_width = 526,
108109
.frame_height = 330,
110+
.include_audio = Params().getBool("RecordAudio"),
109111
INIT_ENCODE_FUNCTIONS(QRoadEncode),
110112
};
111113

system/loggerd/video_writer.cc

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,45 @@ VideoWriter::VideoWriter(const char *path, const char *filename, bool remuxing,
5050
}
5151
}
5252

53+
void VideoWriter::initialize_audio(int sample_rate) {
54+
assert(this->ofmt_ctx->oformat->audio_codec != AV_CODEC_ID_NONE); // check output format supports audio streams
55+
const AVCodec *audio_avcodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
56+
assert(audio_avcodec);
57+
this->audio_codec_ctx = avcodec_alloc_context3(audio_avcodec);
58+
assert(this->audio_codec_ctx);
59+
this->audio_codec_ctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
60+
this->audio_codec_ctx->sample_rate = sample_rate;
61+
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+
62+
av_channel_layout_default(&this->audio_codec_ctx->ch_layout, 1);
63+
#else
64+
this->audio_codec_ctx->channel_layout = AV_CH_LAYOUT_MONO;
65+
#endif
66+
this->audio_codec_ctx->bit_rate = 32000;
67+
this->audio_codec_ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
68+
this->audio_codec_ctx->time_base = (AVRational){1, audio_codec_ctx->sample_rate};
69+
int err = avcodec_open2(this->audio_codec_ctx, audio_avcodec, NULL);
70+
assert(err >= 0);
71+
av_log_set_level(AV_LOG_WARNING); // hide "QAvg" info msgs at the end of every segment
72+
73+
this->audio_stream = avformat_new_stream(this->ofmt_ctx, NULL);
74+
assert(this->audio_stream);
75+
err = avcodec_parameters_from_context(this->audio_stream->codecpar, this->audio_codec_ctx);
76+
assert(err >= 0);
77+
78+
this->audio_frame = av_frame_alloc();
79+
assert(this->audio_frame);
80+
this->audio_frame->format = this->audio_codec_ctx->sample_fmt;
81+
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) // FFmpeg 5.1+
82+
av_channel_layout_copy(&this->audio_frame->ch_layout, &this->audio_codec_ctx->ch_layout);
83+
#else
84+
this->audio_frame->channel_layout = this->audio_codec_ctx->channel_layout;
85+
#endif
86+
this->audio_frame->sample_rate = this->audio_codec_ctx->sample_rate;
87+
this->audio_frame->nb_samples = this->audio_codec_ctx->frame_size;
88+
err = av_frame_get_buffer(this->audio_frame, 0);
89+
assert(err >= 0);
90+
}
91+
5392
void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe) {
5493
if (of && data) {
5594
size_t written = util::safe_fwrite(data, 1, len, of);
@@ -67,8 +106,10 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
67106
}
68107
int err = avcodec_parameters_from_context(out_stream->codecpar, codec_ctx);
69108
assert(err >= 0);
109+
// if there is an audio stream, it must be initialized before this point
70110
err = avformat_write_header(ofmt_ctx, NULL);
71111
assert(err >= 0);
112+
header_written = true;
72113
} else {
73114
// input timestamps are in microseconds
74115
AVRational in_timebase = {1, 1000000};
@@ -77,6 +118,7 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
77118
av_init_packet(&pkt);
78119
pkt.data = data;
79120
pkt.size = len;
121+
pkt.stream_index = this->out_stream->index;
80122

81123
enum AVRounding rnd = static_cast<enum AVRounding>(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
82124
pkt.pts = pkt.dts = av_rescale_q_rnd(timestamp, in_timebase, ofmt_ctx->streams[0]->time_base, rnd);
@@ -95,11 +137,80 @@ void VideoWriter::write(uint8_t *data, int len, long long timestamp, bool codecc
95137
}
96138
}
97139

140+
void VideoWriter::write_audio(uint8_t *data, int len, long long timestamp, int sample_rate) {
141+
if (!remuxing) return;
142+
if (!audio_initialized) {
143+
initialize_audio(sample_rate);
144+
audio_initialized = true;
145+
}
146+
if (!audio_codec_ctx) return;
147+
// sync logMonoTime of first audio packet with the timestampEof of first video packet
148+
if (audio_pts == 0) {
149+
audio_pts = (timestamp * audio_codec_ctx->sample_rate) / 1000000ULL;
150+
}
151+
152+
// convert s16le samples to fltp and add to buffer
153+
const int16_t *raw_samples = reinterpret_cast<const int16_t*>(data);
154+
int sample_count = len / sizeof(int16_t);
155+
constexpr float normalizer = 1.0f / 32768.0f;
156+
157+
const size_t max_buffer_size = sample_rate * 10; // 10 seconds
158+
if (audio_buffer.size() + sample_count > max_buffer_size) {
159+
size_t samples_to_drop = (audio_buffer.size() + sample_count) - max_buffer_size;
160+
LOGE("Audio buffer overflow, dropping %zu oldest samples", samples_to_drop);
161+
audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + samples_to_drop);
162+
audio_pts += samples_to_drop;
163+
}
164+
165+
// Add new samples to the buffer
166+
const size_t original_size = audio_buffer.size();
167+
audio_buffer.resize(original_size + sample_count);
168+
std::transform(raw_samples, raw_samples + sample_count, audio_buffer.begin() + original_size,
169+
[](int16_t sample) { return sample * normalizer; });
170+
171+
if (!header_written) return; // header not written yet, process audio frame after header is written
172+
while (audio_buffer.size() >= audio_codec_ctx->frame_size) {
173+
audio_frame->pts = audio_pts;
174+
float *f_samples = reinterpret_cast<float*>(audio_frame->data[0]);
175+
std::copy(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size, f_samples);
176+
audio_buffer.erase(audio_buffer.begin(), audio_buffer.begin() + audio_codec_ctx->frame_size);
177+
encode_and_write_audio_frame(audio_frame);
178+
}
179+
}
180+
181+
// Sends one frame to the audio encoder and writes every packet the encoder
// has ready to the output container.
//
// frame: the samples to encode, or NULL to flush the encoder (drains the
//        packets it is still holding). audio_pts is advanced by one encoder
//        frame for every non-NULL frame, even if encoding fails, so that
//        later frames keep their position in time.
void VideoWriter::encode_and_write_audio_frame(AVFrame* frame) {
  if (!remuxing || !audio_codec_ctx) return;

  const int send_result = avcodec_send_frame(audio_codec_ctx, frame);  // encode frame
  if (send_result < 0) {
    LOGW("AUDIO: Failed to send audio frame to encoder: %d", send_result);
  } else if (AVPacket *pkt = av_packet_alloc()) {
    int recv_result = 0;
    while ((recv_result = avcodec_receive_packet(audio_codec_ctx, pkt)) == 0) {
      av_packet_rescale_ts(pkt, audio_codec_ctx->time_base, audio_stream->time_base);
      pkt->stream_index = audio_stream->index;

      const int err = av_interleaved_write_frame(ofmt_ctx, pkt);  // write encoded frame
      if (err < 0) {
        LOGW("AUDIO: Write frame failed - error: %d", err);
      }
      av_packet_unref(pkt);
    }
    // EAGAIN (encoder wants more input) and EOF (fully flushed) are the normal ways out of the loop
    if (recv_result != AVERROR(EAGAIN) && recv_result != AVERROR_EOF) {
      LOGW("AUDIO: Failed to receive packet from encoder: %d", recv_result);
    }
    av_packet_free(&pkt);
  } else {
    LOGW("AUDIO: Failed to allocate packet");
  }

  // a flush call carries no samples, so it must not move the timeline
  if (frame != NULL) {
    audio_pts += audio_codec_ctx->frame_size;
  }
}
202+
203+
98204
VideoWriter::~VideoWriter() {
99205
if (this->remuxing) {
206+
if (this->audio_codec_ctx) {
207+
encode_and_write_audio_frame(NULL); // flush encoder
208+
avcodec_free_context(&this->audio_codec_ctx);
209+
}
100210
int err = av_write_trailer(this->ofmt_ctx);
101211
if (err != 0) LOGE("av_write_trailer failed %d", err);
102212
avcodec_free_context(&this->codec_ctx);
213+
if (this->audio_frame) av_frame_free(&this->audio_frame);
103214
err = avio_closep(&this->ofmt_ctx->pb);
104215
if (err != 0) LOGE("avio_closep failed %d", err);
105216
avformat_free_context(this->ofmt_ctx);

system/loggerd/video_writer.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include <string>
4+
#include <deque>
45

56
extern "C" {
67
#include <libavformat/avformat.h>
@@ -13,13 +14,28 @@ class VideoWriter {
1314
public:
1415
VideoWriter(const char *path, const char *filename, bool remuxing, int width, int height, int fps, cereal::EncodeIndex::Type codec);
1516
void write(uint8_t *data, int len, long long timestamp, bool codecconfig, bool keyframe);
17+
void write_audio(uint8_t *data, int len, long long timestamp, int sample_rate);
18+
1619
~VideoWriter();
20+
1721
private:
22+
void initialize_audio(int sample_rate);
23+
void encode_and_write_audio_frame(AVFrame* frame);
24+
1825
std::string vid_path, lock_path;
1926
FILE *of = nullptr;
2027

2128
AVCodecContext *codec_ctx;
2229
AVFormatContext *ofmt_ctx;
2330
AVStream *out_stream;
31+
32+
bool audio_initialized = false;
33+
bool header_written = false;
34+
AVStream *audio_stream = nullptr;
35+
AVCodecContext *audio_codec_ctx = nullptr;
36+
AVFrame *audio_frame = nullptr;
37+
uint64_t audio_pts = 0;
38+
std::deque<float> audio_buffer;
39+
2440
bool remuxing;
2541
};

0 commit comments

Comments
 (0)