From 65f2de2473bedee9ba9aaab3805e4aa8a17079d6 Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Fri, 1 Nov 2024 22:58:26 -0400 Subject: [PATCH 1/2] WIP: Are these even used anymore --- src/config.rs | 9 +++++++++ src/config/cli.rs | 4 ++-- src/config/file.rs | 17 +++++++---------- src/file/name.rs | 18 +++++++++++++++--- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/config.rs b/src/config.rs index edb67d3..8825134 100644 --- a/src/config.rs +++ b/src/config.rs @@ -26,25 +26,34 @@ pub enum Error { #[derive(Getters, Builder)] #[getset(get = "pub")] pub struct Config { + /// See [`self::cli::Config::directories`] #[builder(default=vec![PathBuf::from(".")])] directories: Vec, + /// See [`self::cli::Config::ngram_size`] #[builder(default = 2)] ngram_size: usize, + /// See [`self::cli::Config::boundary_pattern`] #[builder(default=r"\s".to_owned())] boundary_pattern: String, + /// See [`self::cli::Config::wikilink_pattern`] #[builder(default=r"#?\[\[(.*?)]]|#([A-Za-z0-9_]+)".to_owned())] wikilink_pattern: String, + /// See [`self::cli::Config::filename_spacing_pattern`] #[builder(default=r"___|__|-|_|\s".to_owned())] filename_spacing_pattern: String, + /// See [`self::cli::Config::filename_match_threshold`] #[builder(default = 2)] filename_match_threshold: i64, + /// See [`self::cli::Config::exclude`] #[builder(default=vec![])] exclude: Vec, + /// See [`self::file::Config::title_to_filepath`] #[builder(default=Ok(vec![vec![ ReplacePair::new(r"([A-Za-z0-1_-]+).md", r"\[\[$1\]\]").expect("Constant"), ReplacePair::new(r"___", r"/").expect("Constant"), ]]))] filepath_to_title: Result>, ReplacePairError>, + /// See [`self::file::Config::title_to_filepath`] #[builder(default=Ok(vec![vec![ ReplacePair::new(r"\[\[(.*?)\]\]", r"$1.md").expect("Constant"), ReplacePair::new(r"/", r"___").expect("Constant"), diff --git a/src/config/cli.rs b/src/config/cli.rs index 96ce03d..872bca9 100644 --- a/src/config/cli.rs +++ b/src/config/cli.rs @@ -23,7 +23,7 @@ pub(super) struct Config { #[clap(short = 'n', long = "ngram")] pub ngram_size: Option, - /// Regex pattern to stop n-gram generation on, like , or .") + /// Regex pattern to stop n-gram generation on, like , or . #[clap(short = 'b', long = "bound")] pub boundary_pattern: Option, @@ -31,7 +31,7 @@ pub(super) struct Config { #[clap(short = 'w', long = "wikilink")] pub wikilink_pattern: Option, - /// Regex pattern to split filenames on, like _ or -") + /// Regex pattern to split filenames on, like ___ or / #[clap(short = 's', long = "space")] pub filename_spacing_pattern: Option, diff --git a/src/config/file.rs b/src/config/file.rs index d42209f..c40f850 100644 --- a/src/config/file.rs +++ b/src/config/file.rs @@ -8,34 +8,31 @@ use super::{Error, Partial}; #[derive(Serialize, Deserialize, Debug, Default)] pub(super) struct Config { - /// The directories to search in - /// May provide more than one directory + /// See [`super::cli::Config::directories`] #[serde(default)] pub directories: Vec, - /// Size of the n-grams to generate from filenames - /// Will generate n-grams UP TO and INCLUDING this size + /// See [`super::cli::Config::ngram_size`] #[serde(default)] pub ngram_size: Option, - /// Regex pattern to stop n-gram generation on, like , or .") + /// See [`super::cli::Config::boundary_pattern`] #[serde(default)] pub boundary_pattern: Option, - /// Regex pattern for wikilinks + /// See [`super::cli::Config::wikilink_pattern`] #[serde(default)] pub wikilink_pattern: Option, - /// Regex pattern to split filenames on, like _ or -") + /// See [`super::cli::Config::filename_spacing_pattern`] #[serde(default)] pub filename_spacing_pattern: Option, - /// The minimum score to consider a match for filename ngrams + /// See [`super::cli::Config::filename_match_threshold`] #[serde(default)] pub filename_match_threshold: Option, - /// Exclude certain error codes - /// If an error code **starts with** this string, it will be excluded + /// See [`super::cli::Config::exclude`] #[serde(default)] pub exclude: Vec, diff --git a/src/file/name.rs b/src/file/name.rs index 86155ec..13a2602 100644 --- a/src/file/name.rs +++ b/src/file/name.rs @@ -5,7 +5,7 @@ use std::{ use regex::Regex; -use crate::ngrams::up_to_n; +use crate::{ngrams::up_to_n, sed::ReplacePair}; use super::get_files; @@ -25,11 +25,23 @@ pub fn get_filename(path: &Path) -> String { .to_lowercase(); } +/// Get the filename from a path +/// Does not include the file extension +/// Replaces the ___ with / (assuming logseq) +#[must_use] +pub fn get_filename_as_alias( + path: &Path, + filename_spacing_to_group_spacing: &ReplacePair, +) -> String { + let fname = get_filename(path); + ReplacePair::apply(filename_spacing_to_group_spacing, &fname) +} + /// Get the segments of a filename based on [`boundary_regex`] #[must_use] -pub fn filename_segments(path: &Path, boundary_regex: &Regex) -> Vec { +pub fn filename_segments(path: &Path, filename_spacing_regex: &Regex) -> Vec { let filename = get_filename(path); - boundary_regex + filename_spacing_regex .split(&filename) .map(std::string::ToString::to_string) .collect() From 0b9a17b7f68358ff90c0d8874aeaca330a3e2a8f Mon Sep 17 00:00:00 2001 From: Ryan Peach Date: Sat, 2 Nov 2024 01:09:48 -0400 Subject: [PATCH 2/2] Working better on my personal notes library --- src/config.rs | 55 +++++++------- src/config/cli.rs | 14 +++- src/config/file.rs | 83 ++++++++++----------- src/file/content/front_matter.rs | 16 ++-- src/file/content/wikilink.rs | 45 +++++++++++- src/file/name.rs | 102 +++++++++++++++++--------- src/lib.rs | 23 +++--- src/rules/broken_wikilink.rs | 19 ++--- src/rules/duplicate_alias.rs | 121 ++++++++++++++++++++----------- src/rules/similar_filename.rs | 11 ++- src/sed.rs | 31 ++++++-- 11 files changed, 327 insertions(+), 193 deletions(-) diff --git a/src/config.rs b/src/config.rs index 8825134..56d1db0 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,10 +2,15 @@ mod cli; mod file; use std::path::PathBuf; -use crate::sed::{ReplacePair, ReplacePairError}; +use crate::{ + file::{ + content::wikilink::Alias, + name::{Filename, FilenameLowercase}, + }, + sed::{ReplacePair, ReplacePairError}, +}; use bon::Builder; use clap::Parser; -use getset::Getters; use std::io; use thiserror; use toml; @@ -23,43 +28,35 @@ pub enum Error { /// Config which contains both the cli and the config file /// Used to reconcile the two -#[derive(Getters, Builder)] -#[getset(get = "pub")] +#[derive(Builder)] pub struct Config { /// See [`self::cli::Config::directories`] #[builder(default=vec![PathBuf::from(".")])] - directories: Vec, + pub directories: Vec, /// See [`self::cli::Config::ngram_size`] #[builder(default = 2)] - ngram_size: usize, + pub ngram_size: usize, /// See [`self::cli::Config::boundary_pattern`] #[builder(default=r"\s".to_owned())] - boundary_pattern: String, + pub boundary_pattern: String, /// See [`self::cli::Config::wikilink_pattern`] #[builder(default=r"#?\[\[(.*?)]]|#([A-Za-z0-9_]+)".to_owned())] - wikilink_pattern: String, + pub wikilink_pattern: String, /// See [`self::cli::Config::filename_spacing_pattern`] #[builder(default=r"___|__|-|_|\s".to_owned())] - filename_spacing_pattern: String, + pub filename_spacing_pattern: String, /// See [`self::cli::Config::filename_match_threshold`] #[builder(default = 2)] - filename_match_threshold: i64, + pub filename_match_threshold: i64, /// See [`self::cli::Config::exclude`] #[builder(default=vec![])] - exclude: Vec, - /// See [`self::file::Config::title_to_filepath`] - #[builder(default=Ok(vec![vec![ - ReplacePair::new(r"([A-Za-z0-1_-]+).md", r"\[\[$1\]\]").expect("Constant"), - ReplacePair::new(r"___", r"/").expect("Constant"), - ]]))] - filepath_to_title: Result>, ReplacePairError>, - /// See [`self::file::Config::title_to_filepath`] - #[builder(default=Ok(vec![vec![ - ReplacePair::new(r"\[\[(.*?)\]\]", r"$1.md").expect("Constant"), - ReplacePair::new(r"/", r"___").expect("Constant"), - ReplacePair::new(r"(.*)", r"../pages/$1").expect("Constant"), - ]]))] - title_to_filepath: Result>, ReplacePairError>, + pub exclude: Vec, + /// See [`self::file::Config::filename_to_alias`] + #[builder(default=Ok(ReplacePair::new(r"___", r"/").expect("Constant")))] + pub filename_to_alias: Result, ReplacePairError>, + /// See [`self::file::Config::alias_to_filename`] + #[builder(default=Ok(ReplacePair::new(r"/", r"___").expect("Constant")))] + pub alias_to_filename: Result, ReplacePairError>, } /// Things which implement the partial config trait @@ -74,8 +71,10 @@ pub trait Partial { fn filename_spacing_pattern(&self) -> Option; fn filename_match_threshold(&self) -> Option; fn exclude(&self) -> Option>; - fn filepath_to_title(&self) -> Option>, ReplacePairError>>; - fn title_to_filepath(&self) -> Option>, ReplacePairError>>; + fn filename_to_alias(&self) -> Option, ReplacePairError>>; + fn alias_to_filename( + &self, + ) -> Option, ReplacePairError>>; } /// Now we implement a combine function for patrial configs which @@ -92,8 +91,8 @@ fn combine_partials(partials: &[&dyn Partial]) -> Config { .maybe_filename_spacing_pattern(partials.iter().find_map(|p| p.filename_spacing_pattern())) .maybe_filename_match_threshold(partials.iter().find_map(|p| p.filename_match_threshold())) .maybe_exclude(partials.iter().find_map(|p| p.exclude())) - .maybe_filepath_to_title(partials.iter().find_map(|p| p.filepath_to_title())) - .maybe_title_to_filepath(partials.iter().find_map(|p| p.title_to_filepath())) + .maybe_filename_to_alias(partials.iter().find_map(|p| p.filename_to_alias())) + .maybe_alias_to_filename(partials.iter().find_map(|p| p.alias_to_filename())) .build() } diff --git a/src/config/cli.rs b/src/config/cli.rs index 872bca9..578d5d7 100644 --- a/src/config/cli.rs +++ b/src/config/cli.rs @@ -1,7 +1,13 @@ use clap::Parser; use std::path::PathBuf; -use crate::sed::{ReplacePair, ReplacePairError}; +use crate::{ + file::{ + content::wikilink::Alias, + name::{Filename, FilenameLowercase}, + }, + sed::{ReplacePair, ReplacePairError}, +}; use super::Partial; @@ -77,10 +83,12 @@ impl Partial for Config { Some(out) } } - fn filepath_to_title(&self) -> Option>, ReplacePairError>> { + fn filename_to_alias(&self) -> Option, ReplacePairError>> { None } - fn title_to_filepath(&self) -> Option>, ReplacePairError>> { + fn alias_to_filename( + &self, + ) -> Option, ReplacePairError>> { None } } diff --git a/src/config/file.rs b/src/config/file.rs index c40f850..23bd906 100644 --- a/src/config/file.rs +++ b/src/config/file.rs @@ -2,7 +2,13 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -use crate::sed::{ReplacePair, ReplacePairError}; +use crate::{ + file::{ + content::wikilink::Alias, + name::{Filename, FilenameLowercase}, + }, + sed::{ReplacePair, ReplacePairError}, +}; use super::{Error, Partial}; @@ -36,19 +42,15 @@ pub(super) struct Config { #[serde(default)] pub exclude: Vec, - /// Link conversion to file path using sed regex - /// Each outer vec contains an inner vec of a sequence of find/replace pairs - /// Meaning you can have several different and independent sequences of find/replace pairs - /// This makes it easier to manage multiple different types of links and conversions + /// Convert an alias to a filename + /// Kinda like a sed command #[serde(default)] - pub title_to_filepath: Vec>, + pub alias_to_filename: (String, String), - /// Convert a filepath to the "title" lr name in a wikilink - /// Each outer vec contains an inner vec of a sequence of find/replace pairs - /// Meaning you can have several different and independent sequences of find/replace pairs - /// This makes it easier to manage multiple different types of links and conversions + /// Convert a filename to an alias + /// Kinda like a sed command #[serde(default)] - pub filepath_to_title: Vec>, + pub filename_to_alias: (String, String), } impl Config { @@ -97,42 +99,33 @@ impl Partial for Config { } } - fn filepath_to_title(&self) -> Option>, ReplacePairError>> { - let out = self.filepath_to_title.clone(); - if out.is_empty() { - None - } else { - let mut res = Vec::new(); - for inner in out { - let mut inner_res = Vec::new(); - for (find, replace) in inner { - match ReplacePair::new(&find, &replace) { - Ok(pair) => inner_res.push(pair), - Err(e) => return Some(Err(e)), - } - } - res.push(inner_res); - } - Some(Ok(res)) + fn alias_to_filename( + &self, + ) -> Option, ReplacePairError>> { + let (to, from) = self.alias_to_filename.clone(); + match (to.is_empty(), from.is_empty()) { + (true, true) => None, + (false, false) => Some(ReplacePair::new(&to, &from)), + (true, false) => Some(Err(ReplacePairError::ToError(regex::Error::Syntax( + "To is empty".to_string(), + )))), + (false, true) => Some(Err(ReplacePairError::FromError(regex::Error::Syntax( + "From is empty".to_string(), + )))), } } - fn title_to_filepath(&self) -> Option>, ReplacePairError>> { - let out = self.title_to_filepath.clone(); - if out.is_empty() { - None - } else { - let mut res = Vec::new(); - for inner in out { - let mut inner_res = Vec::new(); - for (find, replace) in inner { - match ReplacePair::new(&find, &replace) { - Ok(pair) => inner_res.push(pair), - Err(e) => return Some(Err(e)), - } - } - res.push(inner_res); - } - Some(Ok(res)) + + fn filename_to_alias(&self) -> Option, ReplacePairError>> { + let (to, from) = self.alias_to_filename.clone(); + match (to.is_empty(), from.is_empty()) { + (true, true) => None, + (false, false) => Some(ReplacePair::new(&to, &from)), + (true, false) => Some(Err(ReplacePairError::ToError(regex::Error::Syntax( + "To is empty".to_string(), + )))), + (false, true) => Some(Err(ReplacePairError::FromError(regex::Error::Syntax( + "From is empty".to_string(), + )))), } } } diff --git a/src/file/content/front_matter.rs b/src/file/content/front_matter.rs index d44936e..599a3ab 100644 --- a/src/file/content/front_matter.rs +++ b/src/file/content/front_matter.rs @@ -1,12 +1,12 @@ mod logseq; mod yaml; -use super::Error; +use super::{wikilink::Alias, Error}; #[derive(Debug, Default, Clone)] pub struct FrontMatter { /// The aliases of the file - pub aliases: Vec, + pub aliases: Vec, } impl FrontMatter { @@ -15,7 +15,7 @@ impl FrontMatter { let out = yaml::Config::new(contents)?; if !out.is_empty() { return Ok(FrontMatter { - aliases: out.aliases.iter().map(|x| x.to_lowercase()).collect(), + aliases: out.aliases.iter().map(|x| Alias::new(x)).collect(), }); } @@ -23,7 +23,7 @@ impl FrontMatter { let out = logseq::Config::new(contents)?; if !out.is_empty() { return Ok(FrontMatter { - aliases: out.aliases.iter().map(|x| x.to_lowercase()).collect(), + aliases: out.aliases.iter().map(|x| Alias::new(x)).collect(), }); } @@ -44,9 +44,9 @@ mod tests { assert_eq!( config.aliases, vec![ - "name1".to_string(), - "name2".to_string(), - "name3".to_string() + Alias::new("name1"), + Alias::new("name2"), + Alias::new("name3") ] ); } @@ -58,7 +58,7 @@ mod tests { let config = FrontMatter::new(text).unwrap(); assert_eq!( config.aliases, - vec!["a".to_string(), "b".to_string(), "c".to_string()] + vec![Alias::new("a"), Alias::new("b"), Alias::new("c")] ); } } diff --git a/src/file/content/wikilink.rs b/src/file/content/wikilink.rs index 240e315..628f3c1 100644 --- a/src/file/content/wikilink.rs +++ b/src/file/content/wikilink.rs @@ -1,14 +1,53 @@ +use std::fmt::{Display, Formatter}; + use bon::Builder; use getset::Getters; use itertools::Itertools; use miette::SourceSpan; -use crate::sed::RegexError; +use crate::{ + config::Config, + file::name::Filename, + sed::{RegexError, ReplacePairError}, +}; + +/// A linkable string, like that in a wikilink, or its corresponding filename +/// Aliases are always lowercase +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct Alias(String); + +impl Alias { + #[must_use] + pub fn new(alias: &str) -> Self { + Self(alias.to_lowercase()) + } +} + +impl Display for Alias { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for Alias { + fn from(s: String) -> Self { + Self::new(&s) + } +} + +impl Alias { + pub fn from_filename(filename: &Filename, config: &Config) -> Result { + match config.filename_to_alias.clone() { + Ok(pair) => Ok(pair.apply(filename)), + Err(e) => Err(e), + } + } +} #[derive(Builder, Getters, Clone, Debug)] #[getset(get = "pub")] pub struct Wikilink { - alias: String, + alias: Alias, span: SourceSpan, } @@ -31,7 +70,7 @@ impl Wikilink { wikilinks.push( Wikilink::builder() .span(SourceSpan::new(capture0.start().into(), capture0.len())) - .alias(alias.as_str().to_owned().to_lowercase()) + .alias(Alias::new(alias.as_str())) .build(), ); } diff --git a/src/file/name.rs b/src/file/name.rs index 13a2602..0d33280 100644 --- a/src/file/name.rs +++ b/src/file/name.rs @@ -1,50 +1,84 @@ use std::{ collections::HashMap, + fmt::{Display, Formatter}, path::{Path, PathBuf}, }; use regex::Regex; -use crate::{ngrams::up_to_n, sed::ReplacePair}; +use crate::ngrams::up_to_n; use super::get_files; +/// A filename is a representation of the file name in its original casing +/// And with its original seperators +/// but without its extension and without its path +/// +/// # Example +/// `asdf/Foo___Bar.md` -> `Foo___Bar` +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)] +pub struct Filename(String); + +impl Filename { + #[must_use] + pub fn new(filename: &str) -> Self { + Self(filename.to_owned()) + } + #[must_use] + pub fn lowercase(&self) -> FilenameLowercase { + FilenameLowercase::new(&self.0) + } +} + +impl Display for Filename { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for Filename { + fn from(s: String) -> Self { + Self::new(&s) + } +} + +/// Sometimes you are given a lowercase [`Filename`] and you have to make due +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)] +pub struct FilenameLowercase(String); + +impl FilenameLowercase { + #[must_use] + pub fn new(filename: &str) -> Self { + Self(filename.to_owned().to_lowercase()) + } +} + +impl Display for FilenameLowercase { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for FilenameLowercase { + fn from(s: String) -> Self { + Self::new(&s) + } +} + /// Get the filename from a path /// Does not include the file extension #[must_use] -pub fn get_filename(path: &Path) -> String { +pub fn get_filename(path: &Path) -> Filename { let fname = path .file_name() .expect("We were given a guaranteed file path, not a directory") .to_string_lossy(); - return fname - .split('.') - .next() - .expect("File paths will either have a file extension or not, it makes no difference") - .to_string() - .to_lowercase(); -} - -/// Get the filename from a path -/// Does not include the file extension -/// Replaces the ___ with / (assuming logseq) -#[must_use] -pub fn get_filename_as_alias( - path: &Path, - filename_spacing_to_group_spacing: &ReplacePair, -) -> String { - let fname = get_filename(path); - ReplacePair::apply(filename_spacing_to_group_spacing, &fname) -} - -/// Get the segments of a filename based on [`boundary_regex`] -#[must_use] -pub fn filename_segments(path: &Path, filename_spacing_regex: &Regex) -> Vec { - let filename = get_filename(path); - filename_spacing_regex - .split(&filename) - .map(std::string::ToString::to_string) - .collect() + return Filename::new( + fname + .split('.') + .next() + .expect("File paths will either have a file extension or not, it makes no difference"), + ); } /// Generate n-grams from the filenames found in the directories @@ -60,13 +94,15 @@ pub fn ngrams( for filepath in files { let filename = get_filename(&filepath); let ngrams = up_to_n( - &filename, + &filename.to_string(), ngram_size, boundary_regex, filename_spacing_regex, ); - log::debug!("Filename: {}, ngrams: {:?}", filename, ngrams.len()); - file_name_ngrams.insert(filename, filepath); + log::debug!("Filename: {:?}, ngrams: {:?}", filename, ngrams.len()); + for ngram in ngrams { + file_name_ngrams.insert(ngram, filepath.clone()); + } } file_name_ngrams } diff --git a/src/lib.rs b/src/lib.rs index e033093..7a25159 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,16 +43,13 @@ impl OutputReport { /// but if this library runs, even if it finds linting violations, this returns an Ok pub fn lib(config: &config::Config) -> Result { // Compile our regex patterns - let boundary_regex = regex::Regex::new(config.boundary_pattern()).map_err(|e| miette!(e))?; + let boundary_regex = regex::Regex::new(&config.boundary_pattern).map_err(|e| miette!(e))?; let filename_spacing_regex = - regex::Regex::new(config.filename_spacing_pattern()).map_err(|e| miette!(e))?; - // We need to test for this ahead of time due to caching reasons, and its just nice to put all - // the regex errors next to each other - let _wikilink_regex = regex::Regex::new(config.wikilink_pattern()).map_err(|e| miette!(e))?; + regex::Regex::new(&config.filename_spacing_pattern).map_err(|e| miette!(e))?; let file_ngrams = ngrams( - config.directories().clone(), - *config.ngram_size(), + config.directories.clone(), + config.ngram_size, &boundary_regex, &filename_spacing_regex, ); @@ -60,15 +57,15 @@ pub fn lib(config: &config::Config) -> Result { // All our reports // NOTE: Always use `filter_by_excludes` and `dedupe_by_code` on the reports let similar_filenames = - SimilarFilename::calculate(&file_ngrams, *config.filename_match_threshold()) + SimilarFilename::calculate(&file_ngrams, config.filename_match_threshold) .map_err(|e| miette!("From SimilarFilename: {e}"))? - .filter_by_excludes(config.exclude().clone()) + .filter_by_excludes(config.exclude.clone()) .dedupe_by_code(); let duplicate_aliases = - DuplicateAlias::calculate(get_files(config.directories().clone()), config) + DuplicateAlias::calculate(get_files(config.directories.clone()), config) .map_err(|e| miette!("From DuplicateAlias: {e}"))? - .filter_by_excludes(config.exclude().clone()) + .filter_by_excludes(config.exclude.clone()) .dedupe_by_code(); // Unfortunately we can't continue if we have duplicate aliases @@ -83,9 +80,9 @@ pub fn lib(config: &config::Config) -> Result { } let broken_wikilinks = - BrokenWikilink::calculate(get_files(config.directories().clone()).as_slice(), config) + BrokenWikilink::calculate(get_files(config.directories.clone()).as_slice(), config) .map_err(|e| miette!("From BrokenWikilink: {e}"))? - .filter_by_excludes(config.exclude().clone()) + .filter_by_excludes(config.exclude.clone()) .dedupe_by_code(); // Return diff --git a/src/rules/broken_wikilink.rs b/src/rules/broken_wikilink.rs index d8898c8..419a67b 100644 --- a/src/rules/broken_wikilink.rs +++ b/src/rules/broken_wikilink.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use bon::Builder; -use miette::{miette, Diagnostic, NamedSource, Result, SourceSpan}; +use miette::{Diagnostic, NamedSource, Result, SourceSpan}; use thiserror::Error; use crate::{ @@ -10,7 +10,7 @@ use crate::{ rules::duplicate_alias::DuplicateAlias, }; -use super::HasId; +use super::{duplicate_alias::CalculateError, HasId}; pub const CODE: &str = "content::wikilink::broken"; @@ -44,7 +44,10 @@ impl HasId for BrokenWikilink { } impl BrokenWikilink { - pub fn calculate(files: &[PathBuf], config: &Config) -> Result> { + pub fn calculate( + files: &[PathBuf], + config: &Config, + ) -> Result, CalculateError> { let (lookup_table, _) = DuplicateAlias::get_alias_to_path_table_and_duplicates(files.into(), config)?; @@ -53,16 +56,14 @@ impl BrokenWikilink { let mut out = Vec::new(); for file_path in files { let mut file_content = None; - let wikilinks = from_file(file_path.clone(), config.wikilink_pattern().clone()) - .map_err(|e| miette!(e))? - .wikilinks; - let filename = get_filename(file_path.as_path()); + let wikilinks = + from_file(file_path.clone(), config.wikilink_pattern.clone())?.wikilinks; + let filename = get_filename(file_path.as_path()).lowercase(); for wikilink in wikilinks { let alias = wikilink.alias(); if !lookup_table.contains_key(alias) { if file_content.is_none() { - file_content = - Some(std::fs::read_to_string(file_path).map_err(|e| miette!(e))?); + file_content = Some(std::fs::read_to_string(file_path)?); } out.push( BrokenWikilink::builder() diff --git a/src/rules/duplicate_alias.rs b/src/rules/duplicate_alias.rs index 69af624..e04fe1e 100644 --- a/src/rules/duplicate_alias.rs +++ b/src/rules/duplicate_alias.rs @@ -1,12 +1,16 @@ use std::{collections::HashMap, path::PathBuf}; -use miette::{miette, Diagnostic, NamedSource, Result, SourceOffset, SourceSpan}; +use miette::{Diagnostic, NamedSource, SourceOffset, SourceSpan}; use thiserror::Error; use crate::{ config::Config, - file::{content::from_file, name::get_filename}, - sed::MissingSubstringError, + file::{ + self, + content::{from_file, wikilink::Alias}, + name::{get_filename, Filename}, + }, + sed::{MissingSubstringError, ReplacePairError}, }; use super::HasId; @@ -22,7 +26,7 @@ pub enum DuplicateAlias { id: String, /// The filename the alias contradicts with - other_filename: String, + other_filename: Filename, /// The content of the file with the alias #[source_code] @@ -41,7 +45,7 @@ pub enum DuplicateAlias { id: String, /// The filename which contains the other duplicate alias - other_filename: String, + other_filename: Filename, /// The content of the file with the alias #[source_code] @@ -72,24 +76,47 @@ impl PartialEq for DuplicateAlias { } } +#[derive(Error, Debug)] +pub enum NewDuplicateAliasError { + #[error(transparent)] + MissingSubstringError(#[from] MissingSubstringError), + #[error(transparent)] + ReplacePairError(#[from] ReplacePairError), +} + +#[derive(Error, Debug)] +pub enum CalculateError { + #[error(transparent)] + MissingSubstringError(#[from] MissingSubstringError), + #[error(transparent)] + ReplacePairError(#[from] ReplacePairError), + #[error(transparent)] + FileError(#[from] file::Error), + #[error(transparent)] + NewDuplicateAliasError(#[from] NewDuplicateAliasError), + #[error(transparent)] + IoError(#[from] std::io::Error), +} + impl DuplicateAlias { /// Create a new diagnostic /// based on the two filenames and their similar ngrams /// pub fn new( - alias: &str, + alias: &Alias, + config: &Config, file1_path: &PathBuf, file2_path: &PathBuf, - ) -> Result { + ) -> Result { assert_ne!(file1_path, file2_path); // Create the unique id let id = format!("{CODE}::{alias}"); - if get_filename(file1_path) == alias { + if Alias::from_filename(&get_filename(file1_path), config)? == *alias { let file2_content = std::fs::read_to_string(file2_path).expect("File reported as existing"); // Find the alias - let file2_content_found = file2_content.find(alias).ok_or_else(|| { + let file2_content_found = file2_content.find(&alias.to_string()).ok_or_else(|| { MissingSubstringError::builder() .path(file2_path.clone()) .ngram(alias.to_string()) @@ -97,8 +124,10 @@ impl DuplicateAlias { })?; // Generate the spans relative to the NamedSource - let file2_content_span = - SourceSpan::new(SourceOffset::from(file2_content_found), alias.len()); + let file2_content_span = SourceSpan::new( + SourceOffset::from(file2_content_found), + alias.to_string().len(), + ); Ok(DuplicateAlias::FileNameContentDuplicate { id, @@ -107,9 +136,9 @@ impl DuplicateAlias { alias: file2_content_span, advice: format!("Delete the alias from {}", file2_path.to_string_lossy()), }) - } else if get_filename(file2_path) == alias { + } else if Alias::from_filename(&get_filename(file2_path), config)? == *alias { // This is the same as above just with path 1 and 2 flipped - Self::new(alias, file2_path, file1_path) + Self::new(alias, config, file2_path, file1_path) } else { let file1_content = std::fs::read_to_string(file1_path).expect("File reported as existing"); @@ -117,33 +146,43 @@ impl DuplicateAlias { std::fs::read_to_string(file2_path).expect("File reported as existing"); // Find the alias - let file1_content_found = file1_content.find(alias).ok_or_else(|| { - MissingSubstringError::builder() - .path(file1_path.clone()) - .ngram(alias.to_string()) - .build() - })?; - let file2_content_found = file2_content.find(alias).ok_or_else(|| { - MissingSubstringError::builder() - .path(file2_path.clone()) - .ngram(alias.to_string()) - .build() - })?; + let file1_content_found = file1_content + .to_lowercase() + .find(&alias.to_string()) + .ok_or_else(|| { + MissingSubstringError::builder() + .path(file1_path.clone()) + .ngram(alias.to_string()) + .build() + })?; + let file2_content_found = file2_content + .to_lowercase() + .find(&alias.to_string()) + .ok_or_else(|| { + MissingSubstringError::builder() + .path(file2_path.clone()) + .ngram(alias.to_string()) + .build() + })?; // Generate the spans relative to the NamedSource - let file1_content_span = - SourceSpan::new(SourceOffset::from(file1_content_found), alias.len()); - let file2_content_span = - SourceSpan::new(SourceOffset::from(file2_content_found), alias.len()); + let file1_content_span = SourceSpan::new( + SourceOffset::from(file1_content_found), + alias.to_string().len(), + ); + let file2_content_span = SourceSpan::new( + SourceOffset::from(file2_content_found), + alias.to_string().len(), + ); Ok(DuplicateAlias::FileContentContentDuplicate { id: id.clone(), - other_filename: file2_path.to_string_lossy().to_string(), + other_filename: get_filename(file2_path), src: NamedSource::new(file1_path.to_string_lossy(), file1_content), alias: file1_content_span, other: vec![DuplicateAlias::FileContentContentDuplicate { id, - other_filename: file1_path.to_string_lossy().to_string(), + other_filename: get_filename(file1_path), src: NamedSource::new(file2_path.to_string_lossy(), file2_content), alias: file2_content_span, other: vec![], @@ -152,7 +191,10 @@ impl DuplicateAlias { } } - pub fn calculate(files: Vec, config: &Config) -> Result> { + pub fn calculate( + files: Vec, + config: &Config, + ) -> Result, CalculateError> { Ok(Self::get_alias_to_path_table_and_duplicates(files, config)?.1) } @@ -160,23 +202,20 @@ impl DuplicateAlias { pub fn get_alias_to_path_table_and_duplicates( files: Vec, config: &Config, - ) -> Result<(HashMap, Vec)> { + ) -> Result<(HashMap, Vec), CalculateError> { // First we need to collect all the file names and and aliases and collect a lookup table // relating the string and the path to the file // We may hit a duplicate alias, if so we need to collect all of them and stop - let mut lookup_table = HashMap::::new(); + let mut lookup_table = HashMap::::new(); let mut duplicates: Vec = Vec::new(); for file_path in files { let filename = get_filename(file_path.as_path()); - lookup_table.insert(filename, file_path.clone()); - let front_matter = from_file(file_path.clone(), config.wikilink_pattern().clone()) - .map_err(|e| miette!(e))? - .front_matter; + lookup_table.insert(Alias::from_filename(&filename, config)?, file_path.clone()); + let front_matter = + from_file(file_path.clone(), config.wikilink_pattern.clone())?.front_matter; for alias in front_matter.aliases { if let Some(out) = lookup_table.insert(alias.clone(), file_path.clone()) { - duplicates.push( - DuplicateAlias::new(&alias, &out, &file_path).map_err(|e| miette!(e))?, - ); + duplicates.push(DuplicateAlias::new(&alias, config, &out, &file_path)?); } } } diff --git a/src/rules/similar_filename.rs b/src/rules/similar_filename.rs index a9e86d7..5c99139 100644 --- a/src/rules/similar_filename.rs +++ b/src/rules/similar_filename.rs @@ -54,19 +54,21 @@ impl SimilarFilename { // file paths as strings let file1 = file1_path.to_string_lossy().to_lowercase(); let file2 = file2_path.to_string_lossy().to_lowercase(); + let file1_ngram = file1_ngram.to_lowercase(); + let file2_ngram = file2_ngram.to_lowercase(); // Assemble the source let source = format!("{file1}\n{file2}"); let filepaths = source.clone(); // Find the ngrams in each filepath - let find1 = file1.find(file1_ngram).ok_or_else(|| { + let find1 = file1.find(&file1_ngram).ok_or_else(|| { MissingSubstringError::builder() .path(file1_path.to_path_buf()) .ngram(file1_ngram.to_string()) .build() })?; - let find2 = file2.find(file2_ngram).ok_or_else(|| { + let find2 = file2.find(&file2_ngram).ok_or_else(|| { MissingSubstringError::builder() .path(file2_path.to_path_buf()) .ngram(file2_ngram.to_string()) @@ -138,7 +140,7 @@ impl SimilarFilename { } // Score the ngrams and check if they match - let score = matcher.fuzzy_match(ngram, other_ngram); + let score = matcher.fuzzy_match(&ngram.to_string(), &other_ngram.to_string()); if let Some(score) = score { if score > filename_match_threshold { log::info!("Match! {:?} and {:?}", filepath, other_filepath); @@ -179,6 +181,7 @@ impl SimilarFilename { fn logseq_same_group(file1: &Path, file2: &Path) -> bool { let file1 = get_filename(file1); let file2 = get_filename(file2); - file1.starts_with(&file2) || file2.starts_with(&file1) + file1.to_string().starts_with(&file2.to_string()) + || file2.to_string().starts_with(&file1.to_string()) } } diff --git a/src/sed.rs b/src/sed.rs index 295ad93..626ea15 100644 --- a/src/sed.rs +++ b/src/sed.rs @@ -17,7 +17,7 @@ pub struct MissingSubstringError { ngram: String, } -#[derive(Error, Debug)] +#[derive(Clone, Error, Debug)] pub enum RegexError { #[error("The pattern is not a valid regex")] CompileError(regex::Error), @@ -25,7 +25,7 @@ pub enum RegexError { CaptureError { pattern: String, mat: String }, } -#[derive(Error, Debug)] +#[derive(Error, Debug, Clone)] pub enum ReplacePairError { #[error("The 'from' pattern is not a valid regex")] FromError(regex::Error), @@ -34,15 +34,28 @@ pub enum ReplacePairError { } /// A struct that holds a pair of regex patterns -pub struct ReplacePair { +#[derive(Clone)] +pub struct ReplacePair +where + T: ToString + From, + U: ToString + From, +{ /// The pattern to search for from: Regex, /// The pattern to replace with /// Can use capture groups from the 'from' pattern to: Regex, + /// The type of string coming in + _t: std::marker::PhantomData, + /// The type of string coming out + _u: std::marker::PhantomData, } -impl ReplacePair { +impl ReplacePair +where + T: ToString + From, + U: ToString + From, +{ /// Create a new `ReplacePair` from two regex patterns as strings /// Will return errors if the patterns are not valid regex pub fn new(from: &str, to: &str) -> Result { @@ -53,12 +66,18 @@ impl ReplacePair { Ok(ReplacePair { from: from_regex, to: to_regex, + _t: std::marker::PhantomData, + _u: std::marker::PhantomData, }) } /// Apply replacement to an input string, and return the resultant string #[must_use] - pub fn apply(&self, input: &str) -> String { - self.from.replace_all(input, self.to.as_str()).to_string() + pub fn apply(&self, input: &T) -> U { + let out = self + .from + .replace_all(&input.to_string(), self.to.as_str()) + .to_string(); + out.into() } }