From 5866ba1e383bf51e8e34ae0b0f93c6156c346133 Mon Sep 17 00:00:00 2001
From: Carl Brasic <cbrasic@gmail.com>
Date: Wed, 17 Aug 2022 16:28:24 -0500
Subject: [PATCH] Return binary git paths, not potentially invalid UTF8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Git paths have no inherent encoding, they are opaque binary strings that
can be in any encoding. The macro `rb_str_new_utf8` is used in places to
convert raw C strings representing paths (and other things) into ruby
strings tagged with UTF8 encoding.

This is not safe, since the macro simply copies bytes and tags the
string with the specified encoding; it does not do any validity checking
or transcoding. Thus it can easily create an invalid string (i.e. one
for which `String#valid_encoding?` is false) if the repo contains files
whose paths are multibyte strings in encodings other than UTF8. These
strings are poisoned and difficult to work with: they can't be compared
safely because of the semantics of ruby strings and they often can't be
concatenated to a larger output buffer for display (which will attempt
to transcode to the output buffer's native encoding, usually UTF8). The
comparison issue hit us a few times at GitHub.

More detail: in ruby, string equality for multibyte strings is defined
as bytewise equality *plus* encoding. For convenience, this is
unfortunately not enforced for ASCII strings, so it often becomes a bomb
that only gets triggered when you pass multibyte data through your
application.

So even though a binary-encoded "hello" is equal to the UTF8 equivalent,

    "\x68\x65\x6C\x6C\x6F".b.bytes == "hello".bytes
    # => true
    "\x68\x65\x6C\x6C\x6F".b == "hello"
    # => true

the same is not true for non-ASCII text:

    "\xE6\x97\xA5\xE6\x9C\xAC".b.bytes == "日本".bytes
    # => true
    "\xE6\x^C\xA5\xE6\x9C\xAC".b == "日本"
    # => false

One possible fix is to transcode to UTF-8. But this is a bad idea since
the domain of possible git paths includes many binary strings that are
not representable in UTF8 encoding. Better to acknowledge reality and
use an encoding which matches the actual characteristics of this data so
clients can handle it how they choose.

Note that this PR could go farther and do the same thing for some other
uses of this macro which are similarly invalid (refs), but I've opted to
keep the fix relatively narrow and focus on paths.
---
 ext/rugged/rugged.c           |  2 +-
 ext/rugged/rugged_index.c     |  2 +-
 ext/rugged/rugged_submodule.c |  2 +-
 ext/rugged/rugged_tree.c      |  2 +-
 test/index_test.rb            | 11 +++++++++++
 5 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ext/rugged/rugged.c b/ext/rugged/rugged.c
index fecf48ebf..fa99e3f09 100644
--- a/ext/rugged/rugged.c
+++ b/ext/rugged/rugged.c
@@ -530,7 +530,7 @@ VALUE rb_merge_file_result_fromC(const git_merge_file_result *result)
 	VALUE rb_result = rb_hash_new();
 
 	rb_hash_aset(rb_result, CSTR2SYM("automergeable"), result->automergeable ? Qtrue : Qfalse);
-	rb_hash_aset(rb_result, CSTR2SYM("path"),          result->path ? rb_str_new_utf8(result->path) : Qnil);
+	rb_hash_aset(rb_result, CSTR2SYM("path"),          result->path ? rb_str_new2(result->path) : Qnil);
 	rb_hash_aset(rb_result, CSTR2SYM("filemode"),      INT2FIX(result->mode));
 	rb_hash_aset(rb_result, CSTR2SYM("data"),          rb_str_new(result->ptr, result->len));
 
diff --git a/ext/rugged/rugged_index.c b/ext/rugged/rugged_index.c
index 12ca6f452..e17e27e20 100644
--- a/ext/rugged/rugged_index.c
+++ b/ext/rugged/rugged_index.c
@@ -513,7 +513,7 @@ static VALUE rb_git_indexentry_fromC(const git_index_entry *entry)
 
 	rb_entry = rb_hash_new();
 
-	rb_hash_aset(rb_entry, CSTR2SYM("path"), rb_str_new_utf8(entry->path));
+	rb_hash_aset(rb_entry, CSTR2SYM("path"), rb_str_new2(entry->path));
 	rb_hash_aset(rb_entry, CSTR2SYM("oid"), rugged_create_oid(&entry->id));
 
 	rb_hash_aset(rb_entry, CSTR2SYM("dev"), INT2FIX(entry->dev));
diff --git a/ext/rugged/rugged_submodule.c b/ext/rugged/rugged_submodule.c
index 5076a6b30..65de8bbea 100644
--- a/ext/rugged/rugged_submodule.c
+++ b/ext/rugged/rugged_submodule.c
@@ -558,7 +558,7 @@ static VALUE rb_git_submodule_path(VALUE self)
 
 	path = git_submodule_path(submodule);
 
-	return rb_str_new_utf8(path);
+	return rb_str_new2(path);
 }
 
 #define RB_GIT_OID_GETTER(_klass, _attribute) \
diff --git a/ext/rugged/rugged_tree.c b/ext/rugged/rugged_tree.c
index 9d39f8cfa..d09244c83 100644
--- a/ext/rugged/rugged_tree.c
+++ b/ext/rugged/rugged_tree.c
@@ -30,7 +30,7 @@ static VALUE rb_git_treeentry_fromC(const git_tree_entry *entry)
 
 	rb_entry = rb_hash_new();
 
-	rb_hash_aset(rb_entry, CSTR2SYM("name"), rb_str_new_utf8(git_tree_entry_name(entry)));
+	rb_hash_aset(rb_entry, CSTR2SYM("name"), rb_str_new2(git_tree_entry_name(entry)));
 	rb_hash_aset(rb_entry, CSTR2SYM("oid"), rugged_create_oid(git_tree_entry_id(entry)));
 
 	rb_hash_aset(rb_entry, CSTR2SYM("filemode"), INT2FIX(git_tree_entry_filemode(entry)));
diff --git a/test/index_test.rb b/test/index_test.rb
index dff6e44a9..2bbc91b0a 100644
--- a/test/index_test.rb
+++ b/test/index_test.rb
@@ -222,6 +222,17 @@ def test_conflicts
     assert_equal 3, conflicts[1][:theirs][:stage]
   end
 
+  def test_conflict_paths_are_binary_encoded
+    conflicts = @repo.index.conflicts
+
+    assert_equal 2, conflicts.size
+    conflicts.each do |conflict|
+      conflict.each do |type, data|
+        assert_equal Encoding::BINARY, data[:path].encoding
+      end
+    end
+  end
+
   def test_conflict_get
     conflict = @repo.index.conflict_get("conflicts-one.txt")