diff --git a/src/align.rs b/src/align.rs index 92262cb1f..b2e2bdfee 100644 --- a/src/align.rs +++ b/src/align.rs @@ -1,14 +1,14 @@ use std::cmp::max; use std::collections::VecDeque; -const SUBSTITUTION_COST: usize = 1; -const DELETION_COST: usize = 1; -const INSERTION_COST: usize = 1; +const DELETION_COST: usize = 2; +const INSERTION_COST: usize = 2; +// extra cost for starting a new group of changed tokens +const INITIAL_MISMATCH_PENALITY: usize = 1; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Operation { NoOp, - Substitution, Deletion, Insertion, } @@ -61,14 +61,14 @@ impl<'a> Alignment<'a> { self.table[i] = Cell { parent: 0, operation: Deletion, - cost: i, + cost: i * DELETION_COST + INITIAL_MISMATCH_PENALITY, }; } for j in 1..self.dim[0] { self.table[j * self.dim[1]] = Cell { parent: 0, operation: Insertion, - cost: j, + cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALITY, }; } @@ -76,22 +76,32 @@ impl<'a> Alignment<'a> { for (j, y_j) in self.y.iter().enumerate() { let (left, diag, up) = (self.index(i, j + 1), self.index(i, j), self.index(i + 1, j)); + // The order of the candidates matters if two of them have the + // same cost as in that case we choose the first one. Consider + // insertions and deletions before matches in order to group + // changes together. Insertions are preferred to deletions in + // order to highlight moved tokens as a deletion followed by an + // insertion (as the edit sequence is read backwards we need to + // choose the insertion first) let candidates = [ + Cell { + parent: up, + operation: Insertion, + cost: self.mismatch_cost(up, INSERTION_COST), + }, Cell { parent: left, operation: Deletion, - cost: self.table[left].cost + DELETION_COST, + cost: self.mismatch_cost(left, DELETION_COST), }, Cell { parent: diag, - operation: if x_i == y_j { NoOp } else { Substitution }, - cost: self.table[diag].cost - + if x_i == y_j { 0 } else { SUBSTITUTION_COST }, - }, - Cell { - parent: up, - operation: Insertion, - cost: self.table[up].cost + INSERTION_COST, + operation: NoOp, + cost: if x_i == y_j { + self.table[diag].cost + } else { + usize::MAX + }, }, ]; let index = self.index(i + 1, j + 1); @@ -104,6 +114,16 @@ impl<'a> Alignment<'a> { } } + fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize { + self.table[parent].cost + + basic_cost + + if self.table[parent].operation == NoOp { + INITIAL_MISMATCH_PENALITY + } else { + 0 + } + } + /// Read edit operations from the table. pub fn operations(&self) -> Vec { let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len())); @@ -161,7 +181,6 @@ impl<'a> Alignment<'a> { let parent = &self.table[cell.parent]; let op = match cell.operation { Deletion => "-", - Substitution => "*", Insertion => "+", NoOp => ".", }; @@ -240,68 +259,176 @@ mod tests { #[test] fn test_0() { - let (before, after) = ("aaa", "aba"); - assert_string_distance_parts(before, after, (1, 3)); - assert_eq!(operations(before, after), vec![NoOp, Substitution, NoOp,]); + TestCase { + before: "aaa", + after: "aba", + distance: 5, + parts: (2, 4), + operations: vec![NoOp, Deletion, Insertion, NoOp], + } + .run(); } #[test] fn test_0_nonascii() { - let (before, after) = ("ááb", "áaa"); - assert_string_distance_parts(before, after, (2, 3)); - assert_eq!( - operations(before, after), - vec![NoOp, Substitution, Substitution,] - ); + TestCase { + before: "ááb", + after: "áaa", + distance: 9, + parts: (4, 5), + operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion], + } + .run(); } #[test] fn test_1() { - let (before, after) = ("kitten", "sitting"); - assert_string_distance_parts(before, after, (3, 7)); - assert_eq!( - operations(before, after), - vec![ - Substitution, // K S - NoOp, // I I - NoOp, // T T - NoOp, // T T - Substitution, // E I - NoOp, // N N - Insertion // - G - ] - ); + TestCase { + before: "kitten", + after: "sitting", + distance: 13, + parts: (5, 9), + operations: vec![ + Deletion, // K - + Insertion, // - S + NoOp, // I I + NoOp, // T T + NoOp, // T T + Deletion, // E - + Insertion, // - I + NoOp, // N N + Insertion, // - G + ], + } + .run(); } #[test] fn test_2() { - let (before, after) = ("saturday", "sunday"); - assert_string_distance_parts(before, after, (3, 8)); - assert_eq!( - operations(before, after), - vec![ - NoOp, // S S - Deletion, // A - - Deletion, // T - - NoOp, // U U - Substitution, // R N - NoOp, // D D - NoOp, // A A - NoOp // Y Y - ] - ); + TestCase { + before: "saturday", + after: "sunday", + distance: 10, + parts: (4, 9), + operations: vec![ + NoOp, // S S + Deletion, // A - + Deletion, // T - + NoOp, // U U + Deletion, // R - + Insertion, // - N + NoOp, // D D + NoOp, // A A + NoOp, // Y Y + ], + } + .run(); } - fn assert_string_distance_parts(s1: &str, s2: &str, parts: (usize, usize)) { - let (numer, _) = parts; - assert_string_levenshtein_distance(s1, s2, numer); - assert_eq!(string_distance_parts(s1, s2), parts); - assert_eq!(string_distance_parts(s2, s1), parts); + #[test] + fn test_3() { + TestCase { + // Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion] + before: "ab", + after: "ba", + distance: 6, + parts: (2, 3), + operations: vec![ + Deletion, // a - + NoOp, // b b + Insertion, // - a + ], + } + .run(); + } + + #[test] + fn test_4() { + // Deletions are grouped together. + TestCase { + before: "AABB", + after: "AB", + distance: 5, + parts: (2, 4), + operations: vec![ + NoOp, // A A + Deletion, // A - + Deletion, // B - + NoOp, // B B + ], + } + .run(); } - fn assert_string_levenshtein_distance(s1: &str, s2: &str, d: usize) { - assert_eq!(string_levenshtein_distance(s1, s2), d); - assert_eq!(string_levenshtein_distance(s2, s1), d); + #[test] + fn test_5() { + // Insertions are grouped together. + TestCase { + before: "AB", + after: "AABB", + distance: 5, + parts: (2, 4), + operations: vec![ + NoOp, // A A + Insertion, // - A + Insertion, // - B + NoOp, // B B + ], + } + .run(); + } + + #[test] + fn test_6() { + // Insertion and Deletion are grouped together. + TestCase { + before: "AAABBB", + after: "ACB", + distance: 11, + parts: (5, 7), + operations: vec![ + NoOp, // A A + Deletion, // A - + Deletion, // A - + Deletion, // B - + Deletion, // B - + Insertion, // - C + NoOp, // B B + ], + } + .run(); + } + + struct TestCase<'a> { + before: &'a str, + after: &'a str, + distance: usize, + parts: (usize, usize), + operations: Vec, + } + + impl<'a> TestCase<'a> { + pub fn run(&self) -> () { + self.assert_string_distance_parts(); + assert_eq!(operations(self.before, self.after), self.operations); + } + + fn assert_string_distance_parts(&self) { + self.assert_string_levenshtein_distance(); + assert_eq!(string_distance_parts(self.before, self.after), self.parts); + assert_eq!(string_distance_parts(self.after, self.before), self.parts); + } + + fn assert_string_levenshtein_distance(&self) { + assert_eq!( + string_levenshtein_distance(self.before, self.after), + self.distance + ); + assert_eq!( + string_levenshtein_distance(self.after, self.before), + self.distance + ); + } } fn string_distance_parts(x: &str, y: &str) -> (usize, usize) { diff --git a/src/edits.rs b/src/edits.rs index 9ee4d199c..bcbead11b 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -191,23 +191,11 @@ where *substrings_offset += n; &line[old_offset..*line_offset] }; - let mut minus_section = |n| { - get_section( - n, - &mut minus_line_offset, - &mut x_offset, - &alignment.x, - minus_line, - ) + let mut minus_section = |n: usize, offset: &mut usize| { + get_section(n, &mut minus_line_offset, offset, &alignment.x, minus_line) }; - let mut plus_section = |n| { - get_section( - n, - &mut plus_line_offset, - &mut y_offset, - &alignment.y, - plus_line, - ) + let mut plus_section = |n: usize, offset: &mut usize| { + get_section(n, &mut plus_line_offset, offset, &alignment.y, plus_line) }; let distance_contribution = |section: &str| UnicodeWidthStr::width(section.trim()); @@ -215,7 +203,7 @@ where for (op, n) in alignment.coalesced_operations() { match op { align::Operation::Deletion => { - let minus_section = minus_section(n); + let minus_section = minus_section(n, &mut x_offset); let n_d = distance_contribution(minus_section); d_denom += n_d; d_numer += n_d; @@ -223,12 +211,14 @@ where minus_op_prev = deletion; } align::Operation::NoOp => { - let minus_section = minus_section(n); + let minus_section = minus_section(n, &mut x_offset); let n_d = distance_contribution(minus_section); - d_denom += n_d; + d_denom += 2 * n_d; let is_space = minus_section.trim().is_empty(); let coalesce_space_with_previous = is_space - && ((minus_op_prev == deletion && plus_op_prev == insertion) + && ((minus_op_prev == deletion + && plus_op_prev == insertion + && (x_offset < alignment.x.len() - 1 || y_offset < alignment.y.len() - 1)) || (minus_op_prev == noop_deletion && plus_op_prev == noop_insertion)); annotated_minus_line.push(( if coalesce_space_with_previous { @@ -244,23 +234,13 @@ where } else { noop_insertion }, - plus_section(n), + plus_section(n, &mut y_offset), )); minus_op_prev = noop_deletion; plus_op_prev = noop_insertion; } - align::Operation::Substitution => { - let minus_section = minus_section(n); - let n_d = distance_contribution(minus_section); - d_denom += n_d; - d_numer += n_d; - annotated_minus_line.push((deletion, minus_section)); - annotated_plus_line.push((insertion, plus_section(n))); - minus_op_prev = deletion; - plus_op_prev = insertion; - } align::Operation::Insertion => { - let plus_section = plus_section(n); + let plus_section = plus_section(n, &mut y_offset); let n_d = distance_contribution(plus_section); d_denom += n_d; d_numer += n_d; @@ -631,13 +611,13 @@ mod tests { vec!["fn coalesce_edits<'a, 'b, EditOperation>("], ( vec![vec![ - (MinusNoop, "fn coalesce_edits<'a"), - (MinusNoop, ", EditOperation>("), + (MinusNoop, "fn coalesce_edits<'a, "), + (MinusNoop, "EditOperation>("), ]], vec![vec![ - (PlusNoop, "fn coalesce_edits<'a"), - (Insertion, ", 'b"), - (PlusNoop, ", EditOperation>("), + (PlusNoop, "fn coalesce_edits<'a, "), + (Insertion, "'b, "), + (PlusNoop, "EditOperation>("), ]], ), 0.66, @@ -652,15 +632,15 @@ mod tests { ( vec![vec![ (MinusNoop, "for _ in range(0, "), - (MinusNoop, "options[\"count\"]"), - (MinusNoop, "):"), + (MinusNoop, "options[\"count\"])"), + (MinusNoop, ":"), ]], vec![vec![ (PlusNoop, "for _ in range(0, "), (Insertion, "int("), - (PlusNoop, "options[\"count\"]"), + (PlusNoop, "options[\"count\"])"), (Insertion, ")"), - (PlusNoop, "):"), + (PlusNoop, ":"), ]], ), 0.3, @@ -673,8 +653,8 @@ mod tests { vec!["a a"], vec!["a b a"], ( - vec![vec![(MinusNoop, "a"), (MinusNoop, " a")]], - vec![vec![(PlusNoop, "a"), (Insertion, " b"), (PlusNoop, " a")]], + vec![vec![(MinusNoop, "a "), (MinusNoop, "a")]], + vec![vec![(PlusNoop, "a "), (Insertion, "b "), (PlusNoop, "a")]], ), 1.0, ); @@ -682,8 +662,8 @@ mod tests { vec!["a a"], vec!["a b b a"], ( - vec![vec![(MinusNoop, "a"), (MinusNoop, " a")]], - vec![vec![(PlusNoop, "a"), (Insertion, " b b"), (PlusNoop, " a")]], + vec![vec![(MinusNoop, "a "), (MinusNoop, "a")]], + vec![vec![(PlusNoop, "a "), (Insertion, "b b "), (PlusNoop, "a")]], ), 1.0, ); @@ -698,20 +678,17 @@ mod tests { // TODO: Coalesce runs of the same operation. vec![vec![ (MinusNoop, "so it is safe to read "), - (Deletion, "the"), + (Deletion, "the commit"), (Deletion, " "), - (Deletion, "commit"), - (Deletion, " "), - (Deletion, "number "), - (MinusNoop, "from any one of them."), + (Deletion, "number"), + (MinusNoop, " from any one of them."), ]], vec![vec![ (PlusNoop, "so it is safe to read "), (Insertion, "build"), (Insertion, " "), (Insertion, "info"), - (Insertion, " "), - (PlusNoop, "from any one of them."), + (PlusNoop, " from any one of them."), ]], ), 1.0, @@ -735,7 +712,6 @@ mod tests { } #[test] - #[ignore] fn test_infer_edits_12() { assert_edits( vec![" (xxxxxxxxx, \"build info\"),"], @@ -755,6 +731,119 @@ mod tests { ); } + #[test] + fn test_infer_edits_13() { + assert_paired_edits( + vec!["'b '", "[element,]"], + vec!["' b'", "[element],"], + ( + vec![ + vec![ + (MinusNoop, "'"), + (Deletion, "b"), + (MinusNoop, " "), + (MinusNoop, "'"), + ], + vec![(MinusNoop, "[element"), (Deletion, ","), (MinusNoop, "]")], + ], + vec![ + vec![ + (PlusNoop, "'"), + (PlusNoop, " "), + (Insertion, "b"), + (PlusNoop, "'"), + ], + vec![(PlusNoop, "[element"), (PlusNoop, "]"), (Insertion, ",")], + ], + ), + ); + } + + #[test] + fn test_infer_edits_14() { + assert_paired_edits( + vec!["a b c d ", "p "], + vec!["x y c z ", "q r"], + ( + vec![ + vec![ + (MinusNoop, ""), + (Deletion, "a"), + (Deletion, " "), + (Deletion, "b"), + (MinusNoop, " c "), + (Deletion, "d"), + (MinusNoop, " "), + ], + vec![(MinusNoop, ""), (Deletion, "p"), (Deletion, " ")], + ], + vec![ + vec![ + (PlusNoop, ""), + (Insertion, "x"), + (Insertion, " "), + (Insertion, "y"), + (PlusNoop, " c "), + (Insertion, "z"), + (PlusNoop, " "), + ], + vec![ + (PlusNoop, ""), + (Insertion, "q"), + (Insertion, " "), + (Insertion, "r"), + ], + ], + ), + ); + } + + #[test] + fn test_infer_edits_15() { + assert_paired_edits( + vec![r#"printf "%s\n" s y y | git add -p &&"#], + vec!["test_write_lines s y y | git add -p &&"], + ( + vec![vec![ + (MinusNoop, ""), + (Deletion, r#"printf "%s\n""#), + (MinusNoop, " s y y | git add -p &&"), + ]], + vec![vec![ + (PlusNoop, ""), + (Insertion, "test_write_lines"), + (PlusNoop, " s y y | git add -p &&"), + ]], + ), + ); + } + + #[test] + fn test_infer_edits_16() { + assert_edits( + vec!["a a a a a a b b b"], + vec!["c a a a a a a c c"], + ( + vec![vec![ + (MinusNoop, ""), + (MinusNoop, "a a a a a a "), + (Deletion, "b b"), + (Deletion, " "), + (Deletion, "b"), + ]], + vec![vec![ + (PlusNoop, ""), + (Insertion, "c "), + (PlusNoop, "a a a a a a "), + (Insertion, "c"), + (Insertion, " "), + (Insertion, "c"), + ]], + ), + 1.0, + ); + } + fn assert_edits( minus_lines: Vec<&str>, plus_lines: Vec<&str>,