diff --git a/src/uu/date/src/date.rs b/src/uu/date/src/date.rs index e577d081c49..53e0cd8ab5b 100644 --- a/src/uu/date/src/date.rs +++ b/src/uu/date/src/date.rs @@ -14,6 +14,7 @@ use jiff::tz::{Offset, TimeZone, TimeZoneDatabase}; use jiff::{Timestamp, Zoned}; use std::borrow::Cow; use std::collections::HashMap; +use std::ffi::OsString; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Read, Write, stderr}; use std::path::PathBuf; @@ -52,10 +53,15 @@ const OPT_REFERENCE: &str = "reference"; const OPT_UNIVERSAL: &str = "universal"; const OPT_UNIVERSAL_2: &str = "utc"; +/// Character emitted by `String::from_utf8_lossy` for each ill-formed byte subsequence. +const UNICODE_REPLACEMENT: char = '\u{FFFD}'; + /// Settings for this program, parsed from the command line struct Settings { utc: bool, format: Format, + /// Raw format bytes for Custom format, to preserve non-UTF-8 bytes in output + format_raw: Option>, date_source: DateSource, set_to: Option, debug: bool, @@ -285,7 +291,7 @@ fn parse_military_timezone_with_offset(s: &str) -> Option<(i32, DayDelta)> { pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; - let date_source = if let Some(date_os) = matches.get_one::(OPT_DATE) { + let date_source = if let Some(date_os) = matches.get_one::(OPT_DATE) { // Convert OsString to String, handling invalid UTF-8 with GNU-compatible error let date = date_os.to_str().ok_or_else(|| { let bytes = date_os.as_encoded_bytes(); @@ -307,34 +313,41 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { }; // Check for extra operands (multiple positional arguments) - if let Some(formats) = matches.get_many::(OPT_FORMAT) { - let format_args: Vec<&String> = formats.collect(); + if let Some(formats) = matches.get_many::(OPT_FORMAT) { + let format_args: Vec<&OsString> = formats.collect(); if format_args.len() > 1 { return Err(USimpleError::new( 1, - translate!("date-error-extra-operand", "operand" => format_args[1]), + translate!("date-error-extra-operand", "operand" => format_args[1].to_string_lossy()), )); } } - let format = if let Some(form) = matches.get_one::(OPT_FORMAT) { - if !form.starts_with('+') { + let mut format_raw: Option> = None; + let format = if let Some(form) = matches.get_one::(OPT_FORMAT) { + let raw_bytes = form.as_encoded_bytes(); + if raw_bytes.first() != Some(&b'+') { + let form_lossy = form.to_string_lossy(); // if an optional Format String was found but the user has not provided an input date // GNU prints an invalid date Error if !matches!(date_source, DateSource::Human(_)) { return Err(USimpleError::new( 1, - translate!("date-error-invalid-date", "date" => form), + translate!("date-error-invalid-date", "date" => form_lossy), )); } // If the user did provide an input date with the --date flag and the Format String is // not starting with '+' GNU prints the missing '+' error message return Err(USimpleError::new( 1, - translate!("date-error-format-missing-plus", "arg" => form), + translate!("date-error-format-missing-plus", "arg" => form_lossy), )); } - let form = form[1..].to_string(); + let bytes_after_plus = &raw_bytes[1..]; + if std::str::from_utf8(bytes_after_plus).is_err() { + format_raw = Some(bytes_after_plus.to_vec()); + } + let form = String::from_utf8_lossy(bytes_after_plus).into_owned(); Format::Custom(form) } else if let Some(fmt) = matches .get_many::(OPT_ISO_8601) @@ -381,6 +394,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let settings = Settings { utc, format, + format_raw, date_source, set_to, debug: debug_mode, @@ -544,6 +558,26 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let format_string = make_format_string(&settings); let mut stdout = BufWriter::new(std::io::stdout().lock()); + // Pre-extract non-UTF-8 chunks from the raw format bytes (if any). + // from_utf8_lossy emits one U+FFFD per ill-formed subsequence (WTF-8 spec), + // so we can match them 1:1 when restoring original bytes in the output. + let raw_chunks: Option> = settings.format_raw.as_ref().map(|raw| { + let mut chunks = Vec::new(); + let mut i = 0; + while i < raw.len() { + match std::str::from_utf8(&raw[i..]) { + Ok(_) => break, + Err(e) => { + i += e.valid_up_to(); + let len = e.error_len().unwrap_or(raw.len() - i); + chunks.push(&raw[i..i + len]); + i += len; + } + } + } + chunks + }); + // Format all the dates let config = Config::new().custom(PosixCustom::new()).lenient(true); for date in dates { @@ -562,9 +596,34 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { &config, skip_localization, ) { - Ok(s) => writeln!(stdout, "{s}").map_err(|e| { - USimpleError::new(1, translate!("date-error-write", "error" => e)) - })?, + Ok(s) => { + if let Some(ref chunks) = raw_chunks { + // Restore non-UTF-8 bytes that were replaced with + // U+FFFD by the lossy conversion. strftime passes + // U+FFFD through unchanged. Each FFFD in the output + // corresponds to the next ill-formed byte subsequence + // from the original format string. + let mut chunk_iter = chunks.iter(); + let mut out = Vec::with_capacity(s.len()); + for ch in s.chars() { + if ch == UNICODE_REPLACEMENT { + if let Some(chunk) = chunk_iter.next() { + out.extend_from_slice(chunk); + } + } else { + let mut buf = [0u8; 4]; + out.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes()); + } + } + out.push(b'\n'); + stdout.write_all(&out) + } else { + writeln!(stdout, "{s}") + } + .map_err(|e| { + USimpleError::new(1, translate!("date-error-write", "error" => e)) + })?; + } Err(e) => { let _ = stdout.flush(); return Err(USimpleError::new( @@ -604,7 +663,7 @@ pub fn uu_app() -> Command { .value_name("STRING") .allow_hyphen_values(true) .overrides_with(OPT_DATE) - .value_parser(clap::value_parser!(std::ffi::OsString)) + .value_parser(clap::value_parser!(OsString)) .help(translate!("date-help-date")), ) .arg( @@ -699,7 +758,11 @@ pub fn uu_app() -> Command { .help(translate!("date-help-universal")) .action(ArgAction::SetTrue), ) - .arg(Arg::new(OPT_FORMAT).num_args(0..)) + .arg( + Arg::new(OPT_FORMAT) + .num_args(0..) + .value_parser(clap::value_parser!(OsString)), + ) } fn format_date_with_locale_aware_months( diff --git a/src/uu/date/src/locale.rs b/src/uu/date/src/locale.rs index 8d1c0b0133e..d5fbffbb360 100644 --- a/src/uu/date/src/locale.rs +++ b/src/uu/date/src/locale.rs @@ -88,12 +88,12 @@ cfg_langinfo! { return None; } - let format = CStr::from_ptr(d_t_fmt_ptr).to_str().ok()?; + let format = CStr::from_ptr(d_t_fmt_ptr).to_string_lossy(); if format.is_empty() { return None; } - Some(format.to_string()) + Some(format.into_owned()) } } diff --git a/tests/by-util/test_date.rs b/tests/by-util/test_date.rs index a6139848c08..6f0480ea6b7 100644 --- a/tests/by-util/test_date.rs +++ b/tests/by-util/test_date.rs @@ -2161,6 +2161,37 @@ fn test_locale_day_names() { } } +/// Test that non-UTF-8 format bytes are preserved in output (not replaced +/// with U+FFFD), matching GNU behavior. +#[test] +#[cfg(unix)] +fn test_date_non_utf8_format_preserved() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // Simple case: \xFF should pass through as-is + let fmt_bytes: &[u8] = b"+\xff%m"; + let result = new_ucmd!() + .arg("-d") + .arg("2025-10-11T13:00") + .arg(OsStr::from_bytes(fmt_bytes)) + .succeeds(); + let stdout_bytes = result.stdout(); + assert_eq!(stdout_bytes[0], 0xFF); + assert_eq!(&stdout_bytes[1..3], b"10"); + + // GB18030-encoded "年" (0xC4EA) + "%m" + "月" (0xD4C2) + "%d" + "日" (0xC8D5) + let fmt_bytes: &[u8] = b"+\xc4\xea%m\xd4\xc2%d\xc8\xd5"; + let result = new_ucmd!() + .arg("-d") + .arg("2025-10-11T13:00") + .arg(OsStr::from_bytes(fmt_bytes)) + .succeeds(); + let stdout_bytes = result.stdout(); + // \xC4\xEA + "10" + \xD4\xC2 + "11" + \xC8\xD5 + "\n" + assert_eq!(stdout_bytes, b"\xc4\xea10\xd4\xc211\xc8\xd5\n"); +} + #[test] fn test_percent_percent_not_replaced() { let cases = [