diff --git a/Cargo.lock b/Cargo.lock
index ba3bfc17..54e8f837 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -110,12 +110,13 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
 [[package]]
 name = "fancy-regex"
-version = "0.12.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05"
+checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
 dependencies = [
  "bit-set",
- "regex",
+ "regex-automata",
+ "regex-syntax",
 ]
 
 [[package]]
@@ -336,18 +337,6 @@ dependencies = [
  "crossbeam-utils",
 ]
 
-[[package]]
-name = "regex"
-version = "1.10.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
-dependencies = [
- "aho-corasick",
- "memchr",
- "regex-automata",
- "regex-syntax",
-]
-
 [[package]]
 name = "regex-automata"
 version = "0.4.7"
diff --git a/rten-text/Cargo.toml b/rten-text/Cargo.toml
index 742237da..334f036f 100644
--- a/rten-text/Cargo.toml
+++ b/rten-text/Cargo.toml
@@ -13,7 +13,7 @@ include = ["/src", "/README.md"]
 crate-type = ["lib"]
 
 [dependencies]
-fancy-regex = { version = "0.12.0", default-features = false, features = ["std", "unicode"] }
+fancy-regex = { version = "0.13.0", default-features = false, features = ["std", "unicode"] }
 unicode_categories = "0.1.1"
 unicode-normalization = "0.1.22"
 serde = { workspace = true, features = ["derive"] }
diff --git a/rten-text/src/tokenizers.rs b/rten-text/src/tokenizers.rs
index 5832789e..e2dbadf2 100644
--- a/rten-text/src/tokenizers.rs
+++ b/rten-text/src/tokenizers.rs
@@ -595,7 +595,7 @@ pub enum TokenizerError {
     InvalidTokenId(TokenId),
 
     /// Splitting the input with a regex failed.
-    RegexSplitFailed(fancy_regex::Error),
+    RegexSplitFailed(Box<fancy_regex::Error>),
 
     /// There was an error parsing a byte sequence as a UTF-8 string.
     ///
diff --git a/rten-text/src/tokenizers/bpe.rs b/rten-text/src/tokenizers/bpe.rs
index b33c0cc9..8fe41974 100644
--- a/rten-text/src/tokenizers/bpe.rs
+++ b/rten-text/src/tokenizers/bpe.rs
@@ -18,7 +18,7 @@ pub enum BpeError {
     InvalidMergeEntry(String),
 
     /// The regex for splitting tokens is invalid.
-    InvalidPattern(fancy_regex::Error),
+    InvalidPattern(Box<fancy_regex::Error>),
 
     /// An entry in the vocab (token string to ID map) is not either a known
     /// special token or an entry in the merge list.
@@ -289,7 +289,7 @@ impl Bpe {
         vocab: Option<HashMap<String, TokenId>>,
         added_tokens: HashMap<TokenId, String>,
     ) -> Result<Bpe, BpeError> {
-        let splitter = Regex::new(pattern).map_err(BpeError::InvalidPattern)?;
+        let splitter = Regex::new(pattern).map_err(|err| BpeError::InvalidPattern(err.into()))?;
 
         let mut builder = BpeBuilder::new();
         builder.add_merges(merges)?;
@@ -428,7 +428,7 @@ impl Encoder for Bpe {
         on_token: &mut dyn FnMut(usize, TokenId),
     ) -> Result<(), TokenizerError> {
         for piece in self.splitter.find_iter(text) {
-            let piece = piece.map_err(TokenizerError::RegexSplitFailed)?;
+            let piece = piece.map_err(|err| TokenizerError::RegexSplitFailed(err.into()))?;
             if piece.range().is_empty() {
                 continue;
             }
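
A note on the error-handling change above (not part of the patch): fancy-regex 0.13 replaced its dependency on `regex` with `regex-automata`/`regex-syntax`, and the diff also switches the error variants to hold `Box<fancy_regex::Error>`, presumably to keep the error enums small. With a boxed field, the bare `map_err(BpeError::InvalidPattern)` constructor no longer type-checks, so the call sites use a closure with `err.into()`, which relies on the standard library's blanket `impl From<T> for Box<T>`. Below is a minimal sketch of that pattern under those assumptions; `PatternError` and `compile` are hypothetical stand-ins for the types in the diff, not part of rten-text's API:

```rust
use fancy_regex::Regex;

/// Hypothetical stand-in for `BpeError` / `TokenizerError` in the diff.
#[derive(Debug)]
enum PatternError {
    /// Boxed so the enum stays one pointer wide even if
    /// `fancy_regex::Error` grows.
    InvalidPattern(Box<fancy_regex::Error>),
}

fn compile(pattern: &str) -> Result<Regex, PatternError> {
    // `map_err(PatternError::InvalidPattern)` would require a bare
    // `fancy_regex::Error` argument, so convert with `err.into()`,
    // which uses the blanket `impl From<T> for Box<T>`.
    Regex::new(pattern).map_err(|err| PatternError::InvalidPattern(err.into()))
}

fn main() {
    // Lookbehind needs fancy-regex's backtracking engine.
    let re = compile(r"(?<=foo)bar").expect("pattern should compile");
    // fancy-regex returns `Result` from matching, since backtracking
    // can fail at runtime (e.g. hitting the backtrack limit).
    assert!(re.is_match("foobar").unwrap());
}
```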