From 0762612e7fa68d10a01c2e6d6d1e4183d86f4b1d Mon Sep 17 00:00:00 2001 From: Zheng Wei Tan Date: Tue, 18 Jul 2023 17:14:41 +0200 Subject: [PATCH 1/4] Fix encoding of non-ascii contents written to parameter files. --- .../google/devtools/build/lib/actions/BUILD | 1 + .../build/lib/actions/ParameterFile.java | 10 +++++++- src/test/shell/integration/unicode_test.bzl | 25 +++++++++++++++++++ src/test/shell/integration/unicode_test.sh | 21 ++++++++++++++++ src/test/shell/integration/unicode_test_BUILD | 18 ++++++++++++- 5 files changed, 73 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/actions/BUILD b/src/main/java/com/google/devtools/build/lib/actions/BUILD index a9ad4fde1fae67..b154e2127a2548 100644 --- a/src/main/java/com/google/devtools/build/lib/actions/BUILD +++ b/src/main/java/com/google/devtools/build/lib/actions/BUILD @@ -192,6 +192,7 @@ java_library( "//src/main/java/com/google/devtools/build/lib/util:filetype", "//src/main/java/com/google/devtools/build/lib/util:os", "//src/main/java/com/google/devtools/build/lib/util:shell_escaper", + "//src/main/java/com/google/devtools/build/lib/util:string", "//src/main/java/com/google/devtools/build/lib/util:var_int", "//src/main/java/com/google/devtools/build/lib/util/io", "//src/main/java/com/google/devtools/build/lib/vfs", diff --git a/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java b/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java index 4e86984f77cc32..08c32bd010644a 100644 --- a/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java +++ b/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java @@ -25,6 +25,7 @@ import com.google.devtools.build.lib.util.GccParamFileEscaper; import com.google.devtools.build.lib.util.ShellEscaper; import com.google.devtools.build.lib.vfs.PathFragment; +import com.google.devtools.build.lib.util.StringUtil; import java.io.BufferedOutputStream; import java.io.IOException; import java.io.OutputStream; @@ -33,6 +34,7 @@ import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; +import java.util.stream.StreamSupport; /** * Support for parameter file generation (as used by gcc and other tools, e.g. @@ -119,7 +121,13 @@ private static void writeContent( if (charset.equals(ISO_8859_1)) { writeContentLatin1(outputStream, arguments); } else if (charset.equals(UTF_8)) { - writeContentUtf8(outputStream, arguments); + // Decode potentially already Utf8 strings here, so that they don't get doubly encoded as + // Utf8 again. + ImmutableList args = + StreamSupport.stream(arguments.spliterator(), /* parallel= */ false) + .map(StringUtil::decodeBytestringUtf8) + .collect(ImmutableList.toImmutableList()); + writeContentUtf8(outputStream, args); } else { // Generic charset support OutputStreamWriter out = new OutputStreamWriter(outputStream, charset); diff --git a/src/test/shell/integration/unicode_test.bzl b/src/test/shell/integration/unicode_test.bzl index 301a2faf7507c3..72ec6e4bfb15b5 100644 --- a/src/test/shell/integration/unicode_test.bzl +++ b/src/test/shell/integration/unicode_test.bzl @@ -51,3 +51,28 @@ write_file_rule = rule( "is_executable": attr.bool(), }, ) + +def _run_executable_with_param_file_impl(ctx): + args = ctx.actions.args() + args.use_param_file("%s", use_always = True) + args.add(ctx.attr.content) + ctx.actions.run( + inputs = [], + outputs = [ctx.outputs.out], + arguments = [args, ctx.outputs.out.path], + executable = ctx.executable.executable, + ) + +run_executable_with_param_file_rule = rule( + implementation = _run_executable_with_param_file_impl, + doc = "Writes `content` to a param file and passes the file to the executable", + attrs = { + "out": attr.output(mandatory = True), + "content": attr.string(mandatory = True), + "executable": attr.label( + allow_files = True, + executable = True, + cfg = "exec", + ), + }, +) diff --git a/src/test/shell/integration/unicode_test.sh b/src/test/shell/integration/unicode_test.sh index 7ef3a334c706ac..d4108c86ac6827 100755 --- a/src/test/shell/integration/unicode_test.sh +++ b/src/test/shell/integration/unicode_test.sh @@ -72,4 +72,25 @@ function test_unicode_action_write_content { >>"${TEST_log}" 2>&1 || fail "Output not as expected" } +function test_unicode_action_run_param_file { + local test_name="action_run_param_file" + bazel build --genrule_strategy=local --spawn_strategy=local \ + "//:${test_name}" >& "$TEST_log" \ + || fail "expected build to succeed" + + quoted_unicode_test_expected="'$(cat unicode_test_expected.txt)'" + + echo "Expected: ${quoted_unicode_test_expected}" + + cat_output=$(cat "${PRODUCT_NAME}-bin/${test_name}.out") + assert_equals "${cat_output}" \ + "${quoted_unicode_test_expected}" \ + || fail "Output not as expected" + + param_file_output=$(cat "${PRODUCT_NAME}-bin/${test_name}.out-0.params") + assert_equals "${param_file_output}" \ + "${quoted_unicode_test_expected}" \ + || fail "Output not as expected" +} + run_suite "Integration tests for ${PRODUCT_NAME}'s unicode i/o in actions" \ No newline at end of file diff --git a/src/test/shell/integration/unicode_test_BUILD b/src/test/shell/integration/unicode_test_BUILD index 21c787dfda614f..fa8b334f94ee37 100644 --- a/src/test/shell/integration/unicode_test_BUILD +++ b/src/test/shell/integration/unicode_test_BUILD @@ -1,5 +1,5 @@ # BUILD file for unicode_test -load(":unicode_test.bzl", "run_executable_rule", "write_file_rule") +load(":unicode_test.bzl", "run_executable_rule", "write_file_rule", "run_executable_with_param_file_rule") # In Russian and Bengali: "Down with mojibake! We want unicode!" non_ascii_string = "Долой кракозябры! আমরা ইউনিকোড চাই!" @@ -31,3 +31,19 @@ write_file_rule( content = non_ascii_string, out = "action_write_content.out", ) + +run_executable_with_param_file_rule( + name = "action_run_param_file", + executable = "cat_param_file.sh", + content = non_ascii_string, + out = "action_run_param_file.out", +) + +write_file_rule( + name = "cat_param_file", + content = """#!/bin/bash +cat "$1" >> "$2"; +""", + out = "cat_param_file.sh", + is_executable = True, +) \ No newline at end of file From 58d97e22ae0cbd0f43c5f936236ff88cde9ccc55 Mon Sep 17 00:00:00 2001 From: Zheng Wei Tan Date: Wed, 19 Jul 2023 13:47:31 +0200 Subject: [PATCH 2/4] Try testing on windows. --- src/test/shell/integration/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/shell/integration/BUILD b/src/test/shell/integration/BUILD index 05755b31570c25..a0f967466ce18f 100644 --- a/src/test/shell/integration/BUILD +++ b/src/test/shell/integration/BUILD @@ -873,7 +873,7 @@ sh_test( "@bazel_tools//tools/bash/runfiles", ], # TODO(arostovtsev): figure out how to make this test Windows-compatible. - tags = ["no_windows"], + # tags = ["no_windows"], ) ######################################################################## From a46c53228d933b01ecc7028277ab15d2db21428e Mon Sep 17 00:00:00 2001 From: Zheng Wei Tan Date: Wed, 19 Jul 2023 14:01:58 +0200 Subject: [PATCH 3/4] Remove testing on windows, the newly added test fails on windows as well. --- src/test/shell/integration/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/shell/integration/BUILD b/src/test/shell/integration/BUILD index a0f967466ce18f..05755b31570c25 100644 --- a/src/test/shell/integration/BUILD +++ b/src/test/shell/integration/BUILD @@ -873,7 +873,7 @@ sh_test( "@bazel_tools//tools/bash/runfiles", ], # TODO(arostovtsev): figure out how to make this test Windows-compatible. - # tags = ["no_windows"], + tags = ["no_windows"], ) ######################################################################## From a8d1c7e6d04f1c04fe967556c8c6126acabe9d9d Mon Sep 17 00:00:00 2001 From: Zheng Wei Tan Date: Fri, 21 Jul 2023 09:49:34 +0200 Subject: [PATCH 4/4] Avoid double recoding when dealing with an already utf8 encoded string. --- .../devtools/build/lib/actions/ParameterFile.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java b/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java index 08c32bd010644a..cd64e4de29682e 100644 --- a/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java +++ b/src/main/java/com/google/devtools/build/lib/actions/ParameterFile.java @@ -121,13 +121,7 @@ private static void writeContent( if (charset.equals(ISO_8859_1)) { writeContentLatin1(outputStream, arguments); } else if (charset.equals(UTF_8)) { - // Decode potentially already Utf8 strings here, so that they don't get doubly encoded as - // Utf8 again. - ImmutableList args = - StreamSupport.stream(arguments.spliterator(), /* parallel= */ false) - .map(StringUtil::decodeBytestringUtf8) - .collect(ImmutableList.toImmutableList()); - writeContentUtf8(outputStream, args); + writeContentUtf8(outputStream, arguments); } else { // Generic charset support OutputStreamWriter out = new OutputStreamWriter(outputStream, charset); @@ -178,6 +172,10 @@ private static void writeContentUtf8(OutputStream outputStream, Iterable byte[] bytes = stringUnsafe.getByteArray(line); if (stringUnsafe.getCoder(line) == StringUnsafe.LATIN1 && isAscii(bytes)) { outputStream.write(bytes); + } else if (!StringUtil.decodeBytestringUtf8(line).equals(line)) { + // We successfully decoded line from utf8 - meaning it was already encoded as utf8. + // We do not want to double-encode. + outputStream.write(bytes); } else { ByteBuffer encodedBytes = encoder.encode(CharBuffer.wrap(line)); outputStream.write(