From ca93eedec6a1c9af237282daa1b445af4919b8c3 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sat, 2 Nov 2024 11:53:38 -0400 Subject: [PATCH] Now only compare ngrams of the same size --- src/config.rs | 2 +- src/lib.rs | 10 ++++++---- src/rules/similar_filename.rs | 13 +++++++++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/config.rs b/src/config.rs index b606f29..7d3cccd 100644 --- a/src/config.rs +++ b/src/config.rs @@ -46,7 +46,7 @@ pub struct Config { #[builder(default=r"-|_|\s".to_owned())] pub filename_spacing_pattern: String, /// See [`self::cli::Config::filename_match_threshold`] - #[builder(default = 100)] + #[builder(default = 95)] pub filename_match_threshold: i64, /// See [`self::cli::Config::exclude`] #[builder(default=vec![])] diff --git a/src/lib.rs b/src/lib.rs index 2669fe2..55fbc98 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,14 +56,16 @@ pub fn lib(config: &config::Config) -> Result { // All our reports // NOTE: Always use `filter_by_excludes` and `dedupe_by_code` on the reports - let similar_filenames = SimilarFilename::calculate( + let mut similar_filenames = SimilarFilename::calculate( &file_ngrams, config.filename_match_threshold, &filename_spacing_regex, ) - .map_err(|e| miette!("From SimilarFilename: {e}"))? - .filter_by_excludes(config.exclude.clone()) - .dedupe_by_code(); + .map_err(|e| miette!("From SimilarFilename: {e}"))?; + similar_filenames.sort_by(|b, a| a.partial_cmp(b).expect("This never fails")); + let similar_filenames = similar_filenames + .filter_by_excludes(config.exclude.clone()) + .dedupe_by_code(); let duplicate_aliases = DuplicateAlias::calculate(get_files(config.directories.clone()), config) diff --git a/src/rules/similar_filename.rs b/src/rules/similar_filename.rs index 3b29001..61060eb 100644 --- a/src/rules/similar_filename.rs +++ b/src/rules/similar_filename.rs @@ -16,6 +16,8 @@ pub struct SimilarFilename { /// Used to identify the diagnostic and exclude it if needed id: String, + score: i64, + #[source_code] filepaths: String, @@ -29,6 +31,12 @@ pub struct SimilarFilename { advice: String, } +impl PartialOrd for SimilarFilename { + fn partial_cmp(&self, other: &Self) -> Option { + self.score.partial_cmp(&other.score) + } +} + impl PartialEq for SimilarFilename { fn eq(&self, other: &Self) -> bool { self.id == other.id @@ -109,6 +117,7 @@ impl SimilarFilename { ); Ok(Self { id, + score, filepaths, file1_ngram, file2_ngram, @@ -134,6 +143,10 @@ impl SimilarFilename { for (i, (ngram, filepath)) in file_ngrams.clone().iter().enumerate() { // We can start at i + 1 because we've already checked all previous files for (other_ngram, other_filepath) in file_ngrams.iter().skip(i + 1) { + if ngram.nb_words() != other_ngram.nb_words() { + continue; + } + file_crosscheck_bar.inc(1); // Skip if the same file