pytorch · NicolasHug · Jul 5, 2025 · Jul 5, 2025 · Jul 5, 2025 · Jul 5, 2025
diff --git a/src/torchcodec/_core/AVIOContextHolder.cpp b/src/torchcodec/_core/AVIOContextHolder.cpp
@@ -14,6 +14,7 @@ void AVIOContextHolder::createAVIOContext(
     AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
+    bool isForWriting,
     int bufferSize) {
   TORCH_CHECK(
       bufferSize > 0,
@@ -23,14 +24,18 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
-  TORCH_CHECK(
-      (seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
-      "seek method must be defined, and either write or read must be defined. "
-      "But not both!")
+  TORCH_CHECK(seek != nullptr, "seek method must be defined");
+
+  if (isForWriting) {
+    TORCH_CHECK(write != nullptr, "write method must be defined for writing");
+  } else {
+    TORCH_CHECK(read != nullptr, "read method must be defined for reading");
+  }
+
   avioContext_.reset(avioAllocContext(
       buffer,
       bufferSize,
-      /*write_flag=*/write != nullptr,
+      /*write_flag=*/isForWriting,
       heldData,
       read,
       write,

diff --git a/src/torchcodec/_core/AVIOContextHolder.h b/src/torchcodec/_core/AVIOContextHolder.h
@@ -51,6 +51,7 @@ class AVIOContextHolder {
       AVIOWriteFunction write,
       AVIOSeekFunction seek,
       void* heldData,
+      bool isForWriting,
       int bufferSize = defaultBufferSize);
 
  private:

diff --git a/src/torchcodec/_core/AVIOFileLikeContext.cpp b/src/torchcodec/_core/AVIOFileLikeContext.cpp
@@ -9,21 +9,29 @@
 
 namespace facebook::torchcodec {
 
-AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
+AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike, bool isForWriting)
     : fileLike_{UniquePyObject(new py::object(fileLike))} {
   {
     // TODO: Is it necessary to acquire the GIL here? Is it maybe even
     // harmful? At the moment, this is only called from within a pybind
     // function, and pybind guarantees we have the GIL.
     py::gil_scoped_acquire gil;
-    TORCH_CHECK(
-        py::hasattr(fileLike, "read"),
-        "File like object must implement a read method.");
+
+    if (isForWriting) {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "write"),
+          "File like object must implement a write method for writing.");
+    } else {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "read"),
+          "File like object must implement a read method for reading.");
+    }
+
     TORCH_CHECK(
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read, nullptr, &seek, &fileLike_);
+  createAVIOContext(&read, &write, &seek, &fileLike_, isForWriting);
 }
 
 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
@@ -77,4 +85,12 @@ int64_t AVIOFileLikeContext::seek(void* opaque, int64_t offset, int whence) {
   return py::cast<int64_t>((*fileLike)->attr("seek")(offset, whence));
 }
 
+int AVIOFileLikeContext::write(void* opaque, const uint8_t* buf, int buf_size) {
+  auto fileLike = static_cast<UniquePyObject*>(opaque);
+  py::gil_scoped_acquire gil;
+  py::bytes bytes_obj(reinterpret_cast<const char*>(buf), buf_size);
+
+  return py::cast<int64_t>((*fileLike)->attr("write")(bytes_obj));
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -17,13 +17,16 @@ namespace facebook::torchcodec {
 
 // Enables uers to pass in a Python file-like object. We then forward all read
 // and seek calls back up to the methods on the Python object.
-class AVIOFileLikeContext : public AVIOContextHolder {
+// TODO: explain this. We probably don't want it.
+class __attribute__((visibility("default"))) AVIOFileLikeContext
+    : public AVIOContextHolder {
  public:
-  explicit AVIOFileLikeContext(py::object fileLike);
+  explicit AVIOFileLikeContext(py::object fileLike, bool isForWriting);
 
  private:
   static int read(void* opaque, uint8_t* buf, int buf_size);
   static int64_t seek(void* opaque, int64_t offset, int whence);
+  static int write(void* opaque, const uint8_t* buf, int buf_size);
 
   // Note that we dynamically allocate the Python object because we need to
   // strictly control when its destructor is called. We must hold the GIL

diff --git a/src/torchcodec/_core/AVIOTensorContext.cpp b/src/torchcodec/_core/AVIOTensorContext.cpp
@@ -105,12 +105,14 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
   TORCH_CHECK(data.numel() > 0, "data must not be empty");
   TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
   TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
-  createAVIOContext(&read, nullptr, &seek, &tensorContext_);
+  createAVIOContext(
+      &read, nullptr, &seek, &tensorContext_, /*isForWriting=*/false);
 }
 
 AVIOToTensorContext::AVIOToTensorContext()
     : tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
-  createAVIOContext(nullptr, &write, &seek, &tensorContext_);
+  createAVIOContext(
+      nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
 }
 
 torch::Tensor AVIOToTensorContext::getOutputTensor() {

diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -66,6 +66,7 @@ function(make_torchcodec_libraries
     set(decoder_sources
         AVIOContextHolder.cpp
         AVIOTensorContext.cpp
+        AVIOFileLikeContext.cpp
         FFMPEGCommon.cpp
         Frame.cpp
         DeviceInterface.cpp
@@ -142,15 +143,6 @@ function(make_torchcodec_libraries
         "${pybind_ops_sources}"
         "${pybind_ops_dependencies}"
     )
-    # pybind11 limits the visibility of symbols in the shared library to prevent
-    # stray initialization of py::objects. The rest of the object code must
-    # match. See:
-    #   https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
-    target_compile_options(
-        ${pybind_ops_library_name}
-        PUBLIC
-      "-fvisibility=hidden"
-    )
     # If we don't make sure this flag is set, we run into segfauls at import
     # time on Mac. See:
     #    https://github.com/pybind/pybind11/issues/3907#issuecomment-1170412764

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -135,10 +135,10 @@ AudioEncoder::AudioEncoder(
     const torch::Tensor& samples,
     int sampleRate,
     std::string_view formatName,
-    std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+    std::unique_ptr<AVIOToTensorContext> avioToTensorContext,
     const AudioStreamOptions& audioStreamOptions)
     : samples_(validateSamples(samples)),
-      avioContextHolder_(std::move(avioContextHolder)) {
+      avioToTensorContext_(std::move(avioToTensorContext)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
   int status = avformat_alloc_output_context2(
@@ -153,7 +153,34 @@ AudioEncoder::AudioEncoder(
       getFFMPEGErrorStringFromErrorCode(status));
   avFormatContext_.reset(avFormatContext);
 
-  avFormatContext_->pb = avioContextHolder_->getAVIOContext();
+  avFormatContext_->pb = avioToTensorContext_->getAVIOContext();
+
+  initializeEncoder(sampleRate, audioStreamOptions);
+}
+
+AudioEncoder::AudioEncoder(
+    const torch::Tensor& samples,
+    int sampleRate,
+    std::string_view formatName,
+    std::unique_ptr<AVIOFileLikeContext> avioFileLikeContext,
+    const AudioStreamOptions& audioStreamOptions)
+    : samples_(validateSamples(samples)),
+      avioFileLikeContext_(std::move(avioFileLikeContext)) {
+  setFFmpegLogLevel();
+  AVFormatContext* avFormatContext = nullptr;
+  int status = avformat_alloc_output_context2(
+      &avFormatContext, nullptr, formatName.data(), nullptr);
+
+  TORCH_CHECK(
+      avFormatContext != nullptr,
+      "Couldn't allocate AVFormatContext for file-like object. ",
+      "Check the desired format? Got format=",
+      formatName,
+      ". ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  avFormatContext_.reset(avFormatContext);
+
+  avFormatContext_->pb = avioFileLikeContext_->getAVIOContext();
 
   initializeEncoder(sampleRate, audioStreamOptions);
 }
@@ -217,10 +244,10 @@ void AudioEncoder::initializeEncoder(
 
 torch::Tensor AudioEncoder::encodeToTensor() {
   TORCH_CHECK(
-      avioContextHolder_ != nullptr,
-      "Cannot encode to tensor, avio context doesn't exist.");
+      avioToTensorContext_ != nullptr,
+      "Cannot encode to tensor, avio tensor context doesn't exist.");
   encode();
-  return avioContextHolder_->getOutputTensor();
+  return avioToTensorContext_->getOutputTensor();
 }
 
 void AudioEncoder::encode() {

diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/types.h>
+#include "src/torchcodec/_core/AVIOFileLikeContext.h"
 #include "src/torchcodec/_core/AVIOTensorContext.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/StreamOptions.h"
@@ -20,13 +21,27 @@ class AudioEncoder {
       int sampleRate,
       std::string_view fileName,
       const AudioStreamOptions& audioStreamOptions);
+
+  // We need one constructor for each type of AVIOContextHolder. We can't have a
+  // single constructor that accepts the base AVIOContextHolder class and hold
+  // that as attribute, because we are calling the getOutputTensor() method on
+  // the AVIOToTensorContext, which is not available in the base class.
   AudioEncoder(
       const torch::Tensor& samples,
       int sampleRate,
       std::string_view formatName,
-      std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+      std::unique_ptr<AVIOToTensorContext> AVIOToTensorContext,
       const AudioStreamOptions& audioStreamOptions);
+
+  AudioEncoder(
+      const torch::Tensor& samples,
+      int sampleRate,
+      std::string_view formatName,
+      std::unique_ptr<AVIOFileLikeContext> AVIOFileLikeContext,
+      const AudioStreamOptions& audioStreamOptions);
+
   void encode();
+
   torch::Tensor encodeToTensor();
 
  private:
@@ -49,8 +64,8 @@ class AudioEncoder {
 
   const torch::Tensor samples_;
 
-  // Stores the AVIOContext for the output tensor buffer.
-  std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
+  std::unique_ptr<AVIOToTensorContext> avioToTensorContext_;
+  std::unique_ptr<AVIOFileLikeContext> avioFileLikeContext_;
 
   bool encodeWasCalled_ = false;
 };

diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -23,6 +23,7 @@
     create_from_file_like,
     create_from_tensor,
     encode_audio_to_file,
+    encode_audio_to_file_like,
     encode_audio_to_tensor,
     get_ffmpeg_library_versions,
     get_frame_at_index,

diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -153,6 +153,59 @@ def create_from_file_like(
     return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
 
 
+def encode_audio_to_file_like(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like: Union[io.RawIOBase, io.BufferedIOBase],
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+) -> None:
+    """Encode audio samples to a file-like object.
+
+    Args:
+        samples: Audio samples tensor
+        sample_rate: Sample rate in Hz
+        format: Audio format (e.g., "wav", "mp3", "flac")
+        file_like: File-like object that supports write() and seek() methods
+        bit_rate: Optional bit rate for encoding
+        num_channels: Optional number of output channels
+    """
+    assert _pybind_ops is not None
+
+    if samples.dtype != torch.float32:
+        raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
+
+    # We're having the same problem as with the decoder's create_from_file_like:
+    # We should be able to pass a tensor directly, but this leads to a pybind
+    # error. In order to work around this, we pass the pointer to the tensor's
+    # data, and its shape, in order to re-construct it in C++. For this to work:
+    # - the tensor must be float32
+    # - the tensor  must be contiguous, which is why we call contiguous().
+    #   In theory we could avoid this restriction by also passing the strides?
+    # - IMPORTANT: the input samples tensor and its underlying data must be
+    #   alive during the call.
+    #
+    # A more elegant solution would be to cast the tensor into a py::object, but
+    # casting the py::object backk to a tensor in C++ seems to lead to the same
+    # pybing error.
+
+    samples = samples.contiguous()
+    _pybind_ops.encode_audio_to_file_like(
+        samples.data_ptr(),
+        list(samples.shape),
+        sample_rate,
+        format,
+        file_like,
+        bit_rate,
+        num_channels,
+    )
+
+    # This check is useless but it's critical to keep it to ensures that samples
+    # is still alive during the call to encode_audio_to_file_like.
+    assert samples.is_contiguous()
+
+
 # ==============================
 # Abstract impl for the operators. Needed by torch.compile.
 # ==============================

diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp
@@ -10,7 +10,9 @@
 #include <string>
 
 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
+#include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace py = pybind11;
 
@@ -31,15 +33,58 @@ int64_t create_from_file_like(
     realSeek = seekModeFromString(seek_mode.value());
   }
 
-  auto avioContextHolder = std::make_unique<AVIOFileLikeContext>(file_like);
+  auto avioContextHolder =
+      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
 
   SingleStreamDecoder* decoder =
       new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
   return reinterpret_cast<int64_t>(decoder);
 }
 
+void encode_audio_to_file_like(
+    int64_t data_ptr,
+    const std::vector<int64_t>& shape,
+    int64_t sample_rate,
+    std::string_view format,
+    py::object file_like,
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt) {
+  // We assume float32 *and* contiguity, this must be enforced by the caller.
+  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
+  auto samples = torch::from_blob(
+      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
+
+  // TODO Fix implicit int conversion:
+  // https://github.com/pytorch/torchcodec/issues/679
+  // same for sample_rate parameter below
+  AudioStreamOptions audioStreamOptions;
+  audioStreamOptions.bitRate = bit_rate;
+  audioStreamOptions.numChannels = num_channels;
+
+  auto avioContextHolder =
+      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
+
+  AudioEncoder encoder(
+      samples,
+      static_cast<int>(sample_rate),
+      format,
+      std::move(avioContextHolder),
+      audioStreamOptions);
+  encoder.encode();
+}
+
 PYBIND11_MODULE(decoder_core_pybind_ops, m) {
   m.def("create_from_file_like", &create_from_file_like);
+  m.def(
+      "encode_audio_to_file_like",
+      &encode_audio_to_file_like,
+      "data_ptr",
+      "shape",
+      "sample_rate",
+      "format",
+      "file_like",
+      "bit_rate",
+      "num_channels");
 }
 
 } // namespace facebook::torchcodec