Skip to content

Commit

Permalink
Update fancy-regex to v0.13.0
Browse files Browse the repository at this point in the history
After this update, clippy began to warn about the size of the `TokenizerError`
enum due to changes in `fancy_regex::Error`. To resolve this, box the value.
  • Loading branch information
robertknight committed Jul 28, 2024
1 parent 51f6e29 commit 57d8bf6
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 20 deletions.
19 changes: 4 additions & 15 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion rten-text/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ include = ["/src", "/README.md"]
crate-type = ["lib"]

[dependencies]
fancy-regex = { version = "0.12.0", default-features = false, features = ["std", "unicode"] }
fancy-regex = { version = "0.13.0", default-features = false, features = ["std", "unicode"] }
unicode_categories = "0.1.1"
unicode-normalization = "0.1.22"
serde = { workspace = true, features = ["derive"] }
Expand Down
2 changes: 1 addition & 1 deletion rten-text/src/tokenizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ pub enum TokenizerError {
InvalidTokenId(TokenId),

/// Splitting the input with a regex failed.
RegexSplitFailed(fancy_regex::Error),
RegexSplitFailed(Box<fancy_regex::Error>),

/// There was an error parsing a byte sequence as a UTF-8 string.
///
Expand Down
6 changes: 3 additions & 3 deletions rten-text/src/tokenizers/bpe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub enum BpeError {
InvalidMergeEntry(String),

/// The regex for splitting tokens is invalid.
InvalidPattern(fancy_regex::Error),
InvalidPattern(Box<fancy_regex::Error>),

/// An entry in the vocab (token string to ID map) is not either a known
/// special token or an entry in the merge list.
Expand Down Expand Up @@ -289,7 +289,7 @@ impl Bpe {
vocab: Option<HashMap<EncodedBytes, TokenId>>,
added_tokens: HashMap<TokenId, String>,
) -> Result<Bpe, BpeError> {
let splitter = Regex::new(pattern).map_err(BpeError::InvalidPattern)?;
let splitter = Regex::new(pattern).map_err(|err| BpeError::InvalidPattern(err.into()))?;

let mut builder = BpeBuilder::new();
builder.add_merges(merges)?;
Expand Down Expand Up @@ -428,7 +428,7 @@ impl Encoder for Bpe {
on_token: &mut dyn FnMut(usize, TokenId),
) -> Result<(), TokenizerError> {
for piece in self.splitter.find_iter(text) {
let piece = piece.map_err(TokenizerError::RegexSplitFailed)?;
let piece = piece.map_err(|err| TokenizerError::RegexSplitFailed(err.into()))?;
if piece.range().is_empty() {
continue;
}
Expand Down

0 comments on commit 57d8bf6

Please sign in to comment.