Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Highlighting improvements #1244

Merged
merged 6 commits into from
Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 189 additions & 62 deletions src/align.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use std::cmp::max;
use std::collections::VecDeque;

const SUBSTITUTION_COST: usize = 1;
const DELETION_COST: usize = 1;
const INSERTION_COST: usize = 1;
const DELETION_COST: usize = 2;
const INSERTION_COST: usize = 2;
// extra cost for starting a new group of changed tokens
const INITIAL_MISMATCH_PENALITY: usize = 1;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Operation {
NoOp,
Substitution,
Deletion,
Insertion,
}
Expand Down Expand Up @@ -61,37 +61,47 @@ impl<'a> Alignment<'a> {
self.table[i] = Cell {
parent: 0,
operation: Deletion,
cost: i,
cost: i * DELETION_COST + INITIAL_MISMATCH_PENALITY,
};
}
for j in 1..self.dim[0] {
self.table[j * self.dim[1]] = Cell {
parent: 0,
operation: Insertion,
cost: j,
cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALITY,
};
}

for (i, x_i) in self.x.iter().enumerate() {
for (j, y_j) in self.y.iter().enumerate() {
let (left, diag, up) =
(self.index(i, j + 1), self.index(i, j), self.index(i + 1, j));
// The order of the candidates matters if two of them have the
// same cost as in that case we choose the first one. Consider
// insertions and deletions before matches in order to group
// changes together. Insertions are preferred to deletions in
// order to highlight moved tokens as a deletion followed by an
// insertion (as the edit sequence is read backwards we need to
// choose the insertion first)
let candidates = [
Cell {
parent: up,
operation: Insertion,
cost: self.mismatch_cost(up, INSERTION_COST),
},
Cell {
parent: left,
operation: Deletion,
cost: self.table[left].cost + DELETION_COST,
cost: self.mismatch_cost(left, DELETION_COST),
},
Cell {
parent: diag,
operation: if x_i == y_j { NoOp } else { Substitution },
cost: self.table[diag].cost
+ if x_i == y_j { 0 } else { SUBSTITUTION_COST },
},
Cell {
parent: up,
operation: Insertion,
cost: self.table[up].cost + INSERTION_COST,
operation: NoOp,
cost: if x_i == y_j {
self.table[diag].cost
} else {
usize::MAX
},
},
];
let index = self.index(i + 1, j + 1);
Expand All @@ -104,6 +114,16 @@ impl<'a> Alignment<'a> {
}
}

fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize {
self.table[parent].cost
+ basic_cost
+ if self.table[parent].operation == NoOp {
INITIAL_MISMATCH_PENALITY
} else {
0
}
}

/// Read edit operations from the table.
pub fn operations(&self) -> Vec<Operation> {
let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len()));
Expand Down Expand Up @@ -161,7 +181,6 @@ impl<'a> Alignment<'a> {
let parent = &self.table[cell.parent];
let op = match cell.operation {
Deletion => "-",
Substitution => "*",
Insertion => "+",
NoOp => ".",
};
Expand Down Expand Up @@ -240,68 +259,176 @@ mod tests {

#[test]
fn test_0() {
let (before, after) = ("aaa", "aba");
assert_string_distance_parts(before, after, (1, 3));
assert_eq!(operations(before, after), vec![NoOp, Substitution, NoOp,]);
TestCase {
before: "aaa",
after: "aba",
distance: 5,
parts: (2, 4),
operations: vec![NoOp, Deletion, Insertion, NoOp],
}
.run();
}

#[test]
fn test_0_nonascii() {
let (before, after) = ("ááb", "áaa");
assert_string_distance_parts(before, after, (2, 3));
assert_eq!(
operations(before, after),
vec![NoOp, Substitution, Substitution,]
);
TestCase {
before: "ááb",
after: "áaa",
distance: 9,
parts: (4, 5),
operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion],
}
.run();
}

#[test]
fn test_1() {
let (before, after) = ("kitten", "sitting");
assert_string_distance_parts(before, after, (3, 7));
assert_eq!(
operations(before, after),
vec![
Substitution, // K S
NoOp, // I I
NoOp, // T T
NoOp, // T T
Substitution, // E I
NoOp, // N N
Insertion // - G
]
);
TestCase {
before: "kitten",
after: "sitting",
distance: 13,
parts: (5, 9),
operations: vec![
Deletion, // K -
Insertion, // - S
NoOp, // I I
NoOp, // T T
NoOp, // T T
Deletion, // E -
Insertion, // - I
NoOp, // N N
Insertion, // - G
],
}
.run();
}

#[test]
fn test_2() {
let (before, after) = ("saturday", "sunday");
assert_string_distance_parts(before, after, (3, 8));
assert_eq!(
operations(before, after),
vec![
NoOp, // S S
Deletion, // A -
Deletion, // T -
NoOp, // U U
Substitution, // R N
NoOp, // D D
NoOp, // A A
NoOp // Y Y
]
);
TestCase {
before: "saturday",
after: "sunday",
distance: 10,
parts: (4, 9),
operations: vec![
NoOp, // S S
Deletion, // A -
Deletion, // T -
NoOp, // U U
Deletion, // R -
Insertion, // - N
NoOp, // D D
NoOp, // A A
NoOp, // Y Y
],
}
.run();
}

fn assert_string_distance_parts(s1: &str, s2: &str, parts: (usize, usize)) {
let (numer, _) = parts;
assert_string_levenshtein_distance(s1, s2, numer);
assert_eq!(string_distance_parts(s1, s2), parts);
assert_eq!(string_distance_parts(s2, s1), parts);
#[test]
fn test_3() {
TestCase {
// Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion]
before: "ab",
after: "ba",
distance: 6,
parts: (2, 3),
operations: vec![
Deletion, // a -
NoOp, // b b
Insertion, // - a
],
}
.run();
}

#[test]
fn test_4() {
// Deletions are grouped together.
TestCase {
before: "AABB",
after: "AB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, // A A
Deletion, // A -
Deletion, // B -
NoOp, // B B
],
}
.run();
}

fn assert_string_levenshtein_distance(s1: &str, s2: &str, d: usize) {
assert_eq!(string_levenshtein_distance(s1, s2), d);
assert_eq!(string_levenshtein_distance(s2, s1), d);
#[test]
fn test_5() {
// Insertions are grouped together.
TestCase {
before: "AB",
after: "AABB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, // A A
Insertion, // - A
Insertion, // - B
NoOp, // B B
],
}
.run();
}

#[test]
fn test_6() {
// Insertion and Deletion are grouped together.
TestCase {
before: "AAABBB",
after: "ACB",
distance: 11,
parts: (5, 7),
operations: vec![
NoOp, // A A
Deletion, // A -
Deletion, // A -
Deletion, // B -
Deletion, // B -
Insertion, // - C
NoOp, // B B
],
}
.run();
}

struct TestCase<'a> {
before: &'a str,
after: &'a str,
distance: usize,
parts: (usize, usize),
operations: Vec<Operation>,
}

impl<'a> TestCase<'a> {
pub fn run(&self) -> () {
self.assert_string_distance_parts();
assert_eq!(operations(self.before, self.after), self.operations);
}

fn assert_string_distance_parts(&self) {
self.assert_string_levenshtein_distance();
assert_eq!(string_distance_parts(self.before, self.after), self.parts);
assert_eq!(string_distance_parts(self.after, self.before), self.parts);
}

fn assert_string_levenshtein_distance(&self) {
assert_eq!(
string_levenshtein_distance(self.before, self.after),
self.distance
);
assert_eq!(
string_levenshtein_distance(self.after, self.before),
self.distance
);
}
}

fn string_distance_parts(x: &str, y: &str) -> (usize, usize) {
Expand Down
Loading