From 59daef23510e30c70355b7ba25f50e555c97cd22 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Wed, 10 Jul 2024 17:34:52 +0900 Subject: [PATCH] diff: accept diff inputs by generic iterator This helps migrate internal [u8] variables to BStr. b"" literals in tests are changed to &str to get around potential type incompatibility between &[u8; N] values of different lengths N. --- cli/src/diff_util.rs | 6 +++--- cli/src/merge_tools/builtin.rs | 2 +- lib/src/conflicts.rs | 4 ++-- lib/src/diff.rs | 38 ++++++++++++++++++++-------------- lib/src/files.rs | 6 ++---- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/cli/src/diff_util.rs b/cli/src/diff_util.rs index 4e46fc0349..ac090d4de8 100644 --- a/cli/src/diff_util.rs +++ b/cli/src/diff_util.rs @@ -852,7 +852,7 @@ fn unified_diff_hunks<'content>( right_line_range: 1..1, lines: vec![], }; - let diff = Diff::by_line(&[left_content, right_content]); + let diff = Diff::by_line([left_content, right_content]); let mut diff_hunks = diff.hunks().peekable(); while let Some(hunk) = diff_hunks.next() { match hunk { @@ -910,7 +910,7 @@ fn inline_diff_hunks<'content>( // Like Diff::default_refinement(), but doesn't try to match up contents by // lines. We know left/right_contents have no matching lines. - let mut diff = Diff::for_tokenizer(&[left_content, right_content], diff::find_word_ranges); + let mut diff = Diff::for_tokenizer([left_content, right_content], diff::find_word_ranges); diff.refine_changed_regions(diff::find_nonword_ranges); for hunk in diff.hunks() { match hunk { @@ -1128,7 +1128,7 @@ fn get_diff_stat( // TODO: this matches git's behavior, which is to count the number of newlines // in the file. but that behavior seems unhelpful; no one really cares how // many `0x0a` characters are in an image. 
- let diff = Diff::by_line(&[&left_content.contents, &right_content.contents]); + let diff = Diff::by_line([&left_content.contents, &right_content.contents]); let mut added = 0; let mut removed = 0; for hunk in diff.hunks() { diff --git a/cli/src/merge_tools/builtin.rs b/cli/src/merge_tools/builtin.rs index 01ac8f6dd8..a124026c54 100644 --- a/cli/src/merge_tools/builtin.rs +++ b/cli/src/merge_tools/builtin.rs @@ -225,7 +225,7 @@ fn make_diff_sections( left_contents: &str, right_contents: &str, ) -> Result>, BuiltinToolError> { - let diff = Diff::by_line(&[left_contents.as_bytes(), right_contents.as_bytes()]); + let diff = Diff::by_line([left_contents.as_bytes(), right_contents.as_bytes()]); let mut sections = Vec::new(); for hunk in diff.hunks() { match hunk { diff --git a/lib/src/conflicts.rs b/lib/src/conflicts.rs index c90aa27714..f8d586aa7e 100644 --- a/lib/src/conflicts.rs +++ b/lib/src/conflicts.rs @@ -259,12 +259,12 @@ pub fn materialize_merge_result( output.write_all(&left.0)?; continue; }; - let diff1 = Diff::by_line(&[&left.0, &right1.0]).hunks().collect_vec(); + let diff1 = Diff::by_line([&left.0, &right1.0]).hunks().collect_vec(); // Check if the diff against the next positive term is better. Since // we want to preserve the order of the terms, we don't match against // any later positive terms. 
if let Some(right2) = hunk.get_add(add_index + 1) { - let diff2 = Diff::by_line(&[&left.0, &right2.0]).hunks().collect_vec(); + let diff2 = Diff::by_line([&left.0, &right2.0]).hunks().collect_vec(); if diff_size(&diff2) < diff_size(&diff1) { // If the next positive term is a better match, emit // the current positive term as a snapshot and the next diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 05f536838a..cb691e5516 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -408,13 +408,13 @@ fn intersect_regions( } impl<'input> Diff<'input> { - pub fn for_tokenizer( - inputs: &[&'input [u8]], + pub fn for_tokenizer + ?Sized + 'input>( + inputs: impl IntoIterator, tokenizer: impl Fn(&[u8]) -> Vec>, ) -> Self { - assert!(!inputs.is_empty()); - let base_input = inputs[0]; - let other_inputs = inputs.iter().skip(1).copied().collect_vec(); + let mut inputs = inputs.into_iter().map(AsRef::as_ref); + let base_input = inputs.next().expect("inputs must not be empty"); + let other_inputs = inputs.collect_vec(); // First tokenize each input let base_token_ranges = tokenizer(base_input); let other_token_ranges = other_inputs @@ -471,12 +471,16 @@ impl<'input> Diff<'input> { diff } - pub fn unrefined(inputs: &[&'input [u8]]) -> Self { + pub fn unrefined + ?Sized + 'input>( + inputs: impl IntoIterator, + ) -> Self { Diff::for_tokenizer(inputs, |_| vec![]) } /// Compares `inputs` line by line. - pub fn by_line(inputs: &[&'input [u8]]) -> Self { + pub fn by_line + ?Sized + 'input>( + inputs: impl IntoIterator, + ) -> Self { Diff::for_tokenizer(inputs, find_line_ranges) } @@ -486,7 +490,9 @@ impl<'input> Diff<'input> { // That would let each user decide which hunks to refine. However, it would // probably mean that many callers repeat the same code. Perhaps it // should be possible to refine a whole diff *or* individual hunks. 
- pub fn default_refinement(inputs: &[&'input [u8]]) -> Self { + pub fn default_refinement + ?Sized + 'input>( + inputs: impl IntoIterator, + ) -> Self { let mut diff = Diff::for_tokenizer(inputs, find_line_ranges); diff.refine_changed_regions(find_word_ranges); diff.refine_changed_regions(find_nonword_ranges); @@ -526,7 +532,7 @@ impl<'input> Diff<'input> { slices.push(&self.other_inputs[i][changed_range]); } - let refined_diff = Diff::for_tokenizer(&slices, &tokenizer); + let refined_diff = Diff::for_tokenizer(slices, &tokenizer); for UnchangedRange { base_range, @@ -663,7 +669,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec> { return vec![DiffHunk::Different(vec![left, b""])]; } - Diff::default_refinement(&[left, right]) + Diff::default_refinement([left, right]) .hunks() .collect_vec() } @@ -967,19 +973,19 @@ mod tests { #[test] fn test_diff_single_input() { - let diff = Diff::default_refinement(&[b"abc"]); + let diff = Diff::default_refinement(["abc"]); assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]); } #[test] fn test_diff_single_empty_input() { - let diff = Diff::default_refinement(&[b""]); + let diff = Diff::default_refinement([""]); assert_eq!(diff.hunks().collect_vec(), vec![]); } #[test] fn test_diff_two_inputs_one_different() { - let diff = Diff::default_refinement(&[b"a b c", b"a X c"]); + let diff = Diff::default_refinement(["a b c", "a X c"]); assert_eq!( diff.hunks().collect_vec(), vec![ @@ -992,7 +998,7 @@ mod tests { #[test] fn test_diff_multiple_inputs_one_different() { - let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]); + let diff = Diff::default_refinement(["a b c", "a X c", "a b c"]); assert_eq!( diff.hunks().collect_vec(), vec![ @@ -1005,7 +1011,7 @@ mod tests { #[test] fn test_diff_multiple_inputs_all_different() { - let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]); + let diff = Diff::default_refinement(["a b c", "a X c", "a c X"]); assert_eq!( 
diff.hunks().collect_vec(), vec![ @@ -1021,7 +1027,7 @@ mod tests { fn test_diff_for_tokenizer_compacted() { // Tests that unchanged regions are compacted when using for_tokenizer() let diff = Diff::for_tokenizer( - &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"], + ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"], find_line_ranges, ); assert_eq!( diff --git a/lib/src/files.rs b/lib/src/files.rs index 8203ae3bc5..780f3e4ded 100644 --- a/lib/src/files.rs +++ b/lib/src/files.rs @@ -17,8 +17,6 @@ use std::collections::VecDeque; use std::fmt::{Debug, Error, Formatter}; -use itertools::Itertools; - use crate::diff; use crate::diff::{Diff, DiffHunk}; use crate::merge::{trivial_merge, Merge}; @@ -163,9 +161,9 @@ pub fn merge(slices: &Merge<&[u8]>) -> MergeResult { // usually done for 3-way conflicts. Are there better heuristics when there are // more than 3 parts? let num_diffs = slices.removes().len(); - let diff_inputs = slices.removes().chain(slices.adds()).copied().collect_vec(); + let diff_inputs = slices.removes().chain(slices.adds()); - let diff = Diff::by_line(&diff_inputs); + let diff = Diff::by_line(diff_inputs); let mut resolved_hunk = ContentHunk(vec![]); let mut merge_hunks: Vec> = vec![]; for diff_hunk in diff.hunks() {