diff --git a/src/uu/numfmt/src/format.rs b/src/uu/numfmt/src/format.rs index 33dc58bc29d..f27926d8744 100644 --- a/src/uu/numfmt/src/format.rs +++ b/src/uu/numfmt/src/format.rs @@ -358,32 +358,56 @@ fn format_string( )) } -fn format_and_print_delimited(s: &str, options: &NumfmtOptions) -> Result<()> { +fn split_bytes<'a>(input: &'a [u8], delim: &'a [u8]) -> impl Iterator<Item = &'a [u8]> { + let mut remainder = Some(input); + std::iter::from_fn(move || { + let input = remainder.take()?; + match input.windows(delim.len()).position(|w| w == delim) { + Some(pos) => { + remainder = Some(&input[pos + delim.len()..]); + Some(&input[..pos]) + } + None => Some(input), + } + }) +} + +pub fn format_and_print_delimited(input: &[u8], options: &NumfmtOptions) -> Result<()> { let delimiter = options.delimiter.as_ref().unwrap(); - let mut output = String::new(); + let mut output: Vec<u8> = Vec::new(); + let eol = if options.zero_terminated { + b'\0' + } else { + b'\n' + }; - for (n, field) in (1..).zip(s.split(delimiter)) { + for (n, field) in (1..).zip(split_bytes(input, delimiter)) { let field_selected = uucore::ranges::contain(&options.fields, n); // add delimiter before second and subsequent fields if n > 1 { - output.push_str(delimiter); + output.extend_from_slice(delimiter); } if field_selected { - output.push_str(&format_string(field.trim_start(), options, None)?); + // Field must be valid UTF-8 for numeric conversion + let field_str = std::str::from_utf8(field) + .map_err(|_| translate!("numfmt-error-invalid-number", "input" => String::from_utf8_lossy(field).into_owned().quote()))? 
+ .trim_start(); + let formatted = format_string(field_str, options, None)?; + output.extend_from_slice(formatted.as_bytes()); } else { // add unselected field without conversion - output.push_str(field); + output.extend_from_slice(field); } } - println!("{output}"); + output.push(eol); + std::io::Write::write_all(&mut std::io::stdout(), &output).map_err(|e| e.to_string())?; Ok(()) } - -fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> { +pub fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> { let mut output = String::new(); for (n, (prefix, field)) in (1..).zip(WhitespaceSplitter { s: Some(s) }) { @@ -428,18 +452,6 @@ fn format_and_print_whitespace(s: &str, options: &NumfmtOptions) -> Result<()> { Ok(()) } -/// Format a line of text according to the selected options. -/// -/// Given a line of text `s`, split the line into fields, transform and format -/// any selected numeric fields, and print the result to stdout. Fields not -/// selected for conversion are passed through unmodified. -pub fn format_and_print(s: &str, options: &NumfmtOptions) -> Result<()> { - match &options.delimiter { - Some(_) => format_and_print_delimited(s, options), - None => format_and_print_whitespace(s, options), - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/uu/numfmt/src/numfmt.rs b/src/uu/numfmt/src/numfmt.rs index 81e8ab9cd0a..d5ce81138e7 100644 --- a/src/uu/numfmt/src/numfmt.rs +++ b/src/uu/numfmt/src/numfmt.rs @@ -4,10 +4,11 @@ // file that was distributed with this source code. 
use crate::errors::*; -use crate::format::format_and_print; +use crate::format::{format_and_print_delimited, format_and_print_whitespace}; use crate::options::*; use crate::units::{Result, Unit}; -use clap::{Arg, ArgAction, ArgMatches, Command, parser::ValueSource}; +use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser, parser::ValueSource}; +use std::ffi::OsString; use std::io::{BufRead, Error, Write}; use std::result::Result as StdResult; use std::str::FromStr; @@ -15,6 +16,7 @@ use std::str::FromStr; use units::{IEC_BASES, SI_BASES}; use uucore::display::Quotable; use uucore::error::UResult; +use uucore::os_str_as_bytes; use uucore::translate; use uucore::parser::shortcut_value_parser::ShortcutValueParser; @@ -26,7 +28,7 @@ pub mod format; pub mod options; mod units; -fn handle_args<'a>(args: impl Iterator<Item = &'a str>, options: &NumfmtOptions) -> UResult<()> { +fn handle_args<'a>(args: impl Iterator<Item = &'a [u8]>, options: &NumfmtOptions) -> UResult<()> { for l in args { format_and_handle_validation(l, options)?; } @@ -37,40 +39,45 @@ fn handle_buffer<R>(input: R, options: &NumfmtOptions) -> UResult<()> where R: BufRead, { - if options.zero_terminated { - handle_buffer_iterator( - input - .split(0) - // FIXME: This panics on UTF8 decoding, but this util in general doesn't handle - // invalid UTF8 - .map(|bytes| Ok(String::from_utf8(bytes?).unwrap())), - options, - ) - } else { - handle_buffer_iterator(input.lines(), options) - } + let terminator = if options.zero_terminated { 0u8 } else { b'\n' }; + handle_buffer_iterator(input.split(terminator), options, terminator) } fn handle_buffer_iterator( - iter: impl Iterator<Item = StdResult<String, Error>>, + iter: impl Iterator<Item = StdResult<Vec<u8>, Error>>, options: &NumfmtOptions, + terminator: u8, ) -> UResult<()> { - let eol = if options.zero_terminated { '\0' } else { '\n' }; for (idx, line_result) in iter.enumerate() { match line_result { Ok(line) if idx < options.header => { - print!("{line}{eol}"); + std::io::stdout().write_all(&line)?; 
std::io::stdout().write_all(&[terminator])?; Ok(()) } - Ok(line) => format_and_handle_validation(line.as_ref(), options), + Ok(line) => format_and_handle_validation(&line, options), Err(err) => return Err(Box::new(NumfmtError::IoError(err.to_string()))), }?; } Ok(()) } -fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UResult<()> { - let handled_line = format_and_print(input_line, options); +fn format_and_handle_validation(input_line: &[u8], options: &NumfmtOptions) -> UResult<()> { + let eol = if options.zero_terminated { + b'\0' + } else { + b'\n' + }; + + let handled_line = if options.delimiter.is_some() { + format_and_print_delimited(input_line, options) + } else { + // Whitespace mode requires valid UTF-8 + match std::str::from_utf8(input_line) { + Ok(s) => format_and_print_whitespace(s, options), + Err(_) => Err(translate!("numfmt-error-invalid-input")), + } + }; if let Err(error_message) = handled_line { match options.invalid { @@ -85,7 +92,8 @@ fn format_and_handle_validation(input_line: &str, options: &NumfmtOptions) -> UR } InvalidModes::Ignore => {} } - println!("{input_line}"); + std::io::stdout().write_all(input_line)?; + std::io::stdout().write_all(&[eol])?; } Ok(()) @@ -150,6 +158,22 @@ fn parse_unit_size_suffix(s: &str) -> Option { None } +/// Parse delimiter argument, ensuring it's a single character. +/// For non-UTF8 locales, we allow up to 4 bytes (max UTF-8 char length). 
+fn parse_delimiter(arg: &OsString) -> Result<Vec<u8>> { + let bytes = os_str_as_bytes(arg).map_err(|e| e.to_string())?; + // TODO: Cut, NL and here need to find a better way to do locale specific character count + if arg.to_str().is_some_and(|s| s.chars().count() > 1) + || (arg.to_str().is_none() && bytes.len() > 4) + { + Err(translate!( + "numfmt-error-delimiter-must-be-single-character" + )) + } else { + Ok(bytes.to_vec()) + } +} + fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> { let from = parse_unit(args.get_one::<String>(FROM).unwrap())?; let to = parse_unit(args.get_one::<String>(TO).unwrap())?; @@ -212,15 +236,10 @@ fn parse_options(args: &ArgMatches) -> Result<NumfmtOptions> { )); } - let delimiter = args.get_one::<String>(DELIMITER).map_or(Ok(None), |arg| { - if arg.len() == 1 { - Ok(Some(arg.to_owned())) - } else { - Err(translate!( - "numfmt-error-delimiter-must-be-single-character" - )) - } - })?; + let delimiter = args + .get_one::<OsString>(DELIMITER) + .map(parse_delimiter) + .transpose()?; // unwrap is fine because the argument has a default value let round = match args.get_one::<String>(ROUND).unwrap().as_str() { @@ -264,8 +283,14 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let options = parse_options(&matches).map_err(NumfmtError::IllegalArgument)?; - let result = match matches.get_many::<String>(NUMBER) { - Some(values) => handle_args(values.map(|s| s.as_str()), &options), + let result = match matches.get_many::<OsString>(NUMBER) { + Some(values) => { + let byte_args: Vec<&[u8]> = values + .map(|s| os_str_as_bytes(s).map_err(|e| e.to_string())) + .collect::<StdResult<Vec<_>, _>>() + .map_err(NumfmtError::IllegalArgument)?; + handle_args(byte_args.into_iter(), &options) + } None => { let stdin = std::io::stdin(); let mut locked_stdin = stdin.lock(); @@ -296,6 +321,7 @@ pub fn uu_app() -> Command { .short('d') .long(DELIMITER) .value_name("X") + .value_parser(ValueParser::os_string()) .help(translate!("numfmt-help-delimiter")), ) .arg( @@ -397,7 +423,12 @@ pub fn uu_app() -> Command { 
.help(translate!("numfmt-help-zero-terminated")) .action(ArgAction::SetTrue), ) - .arg(Arg::new(NUMBER).hide(true).action(ArgAction::Append)) + .arg( + Arg::new(NUMBER) + .hide(true) + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()), + ) } #[cfg(test)] @@ -528,7 +559,7 @@ mod tests { #[test] fn args_fail_returns_status_2_for_invalid_input() { - let input_value = ["5", "4Q"].into_iter(); + let input_value = [b"5".as_slice(), b"4Q"].into_iter(); let mut options = get_valid_options(); options.invalid = InvalidModes::Fail; handle_args(input_value, &options).unwrap(); @@ -541,7 +572,7 @@ mod tests { #[test] fn args_warn_returns_status_0_for_invalid_input() { - let input_value = ["5", "4Q"].into_iter(); + let input_value = [b"5".as_slice(), b"4Q"].into_iter(); let mut options = get_valid_options(); options.invalid = InvalidModes::Warn; let result = handle_args(input_value, &options); diff --git a/src/uu/numfmt/src/options.rs b/src/uu/numfmt/src/options.rs index eaf0d8b8b48..a8d16bda9f6 100644 --- a/src/uu/numfmt/src/options.rs +++ b/src/uu/numfmt/src/options.rs @@ -50,7 +50,7 @@ pub struct NumfmtOptions { pub padding: isize, pub header: usize, pub fields: Vec<Range>, - pub delimiter: Option<String>, + pub delimiter: Option<Vec<u8>>, pub round: RoundMethod, pub suffix: Option<String>, pub unit_separator: String, diff --git a/tests/by-util/test_numfmt.rs b/tests/by-util/test_numfmt.rs index 610e84adb52..241286d07c6 100644 --- a/tests/by-util/test_numfmt.rs +++ b/tests/by-util/test_numfmt.rs @@ -1116,6 +1116,25 @@ fn test_zero_terminated_embedded_newline() { .stdout_is("1000 2000\x003000 4000\x00"); } +#[cfg(unix)] +#[test] +fn test_non_utf8_delimiter() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // Single-byte non-UTF8 (0xFF) and multi-byte (0xA2E3, e.g. 
GB18030) + for delim in [&[0xFFu8][..], &[0xA2, 0xE3]] { + let input: Vec<u8> = [b"1", delim, b"2K"].concat(); + let expected: Vec<u8> = [b"1", delim, b"2000\n"].concat(); + new_ucmd!() + .args(&["--from=si", "--field=2", "-d"]) + .arg(OsStr::from_bytes(delim)) + .arg(OsStr::from_bytes(&input)) + .succeeds() + .stdout_is_bytes(expected); + } +} + #[test] fn test_unit_separator() { for (args, expected) in [