From 788e3f33315d59edc3a781712483ddc646a3e7af Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 15 Sep 2023 15:36:36 +0000
Subject: [PATCH 1/4] Move the code from binary mdbook-i18n-normalize.rs into
 library code

This refactoring is needed so that fuzzing code can import and test
this logic.
---
 src/bin/mdbook-i18n-normalize.rs | 544 +-----------------------------
 src/lib.rs                       |   2 +
 src/normalize.rs                 | 548 +++++++++++++++++++++++++++++++
 3 files changed, 551 insertions(+), 543 deletions(-)
 create mode 100644 src/normalize.rs

diff --git a/src/bin/mdbook-i18n-normalize.rs b/src/bin/mdbook-i18n-normalize.rs
index a48641e7..b3be34f5 100644
--- a/src/bin/mdbook-i18n-normalize.rs
+++ b/src/bin/mdbook-i18n-normalize.rs
@@ -10,251 +10,11 @@
 //! safely move to a new version of the mdbook-i18n-helpers without
 //! losing existing translations.
 
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::Read;
 use std::path::Path;
 
 use anyhow::{bail, Context};
-use mdbook_i18n_helpers::{extract_messages, new_cmark_parser};
-use polib::catalog::Catalog;
-use polib::message::{Message, MessageFlags, MessageMutView, MessageView};
+use mdbook_i18n_helpers::normalize::normalize;
 use polib::po_file;
-use pulldown_cmark::{Event, LinkType, Tag};
-
-fn parse_source(source: &str) -> Option<(&str, usize)> {
-    let (path, lineno) = source.split_once(':')?;
-    Some((path, lineno.parse().ok()?))
-}
-
-fn compute_source(source: &str, delta: usize) -> String {
-    let mut new_source = String::with_capacity(source.len());
-
-    for path_lineno in source.split_whitespace() {
-        if !new_source.is_empty() {
-            new_source.push('\n');
-        }
-        if let Some((path, lineno)) = parse_source(path_lineno) {
-            new_source.push_str(&format!("{path}:{}", lineno + delta));
-        } else {
-            new_source.push_str(source);
-        }
-    }
-
-    new_source
-}
-
-/// Check if `text` contains one or more broken reference links.
-fn has_broken_link(text: &str) -> bool {
-    // The return value from the callback is not important, it just
-    // has to return Some to generate a `LinkType::*Unknown`.
-    let mut callback = |_| Some(("".into(), "".into()));
-    new_cmark_parser(text, Some(&mut callback)).any(|event| {
-        matches!(
-            event,
-            Event::Start(Tag::Link(
-                LinkType::ReferenceUnknown | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown,
-                _,
-                _
-            ))
-        )
-    })
-}
-
-#[derive(Debug, Copy, Clone)]
-enum MessageField {
-    Msgid,
-    Msgstr,
-}
-
-impl MessageField {
-    fn project<'a>(&self, msgid: &'a str, msgstr: &'a str) -> &'a str {
-        match self {
-            MessageField::Msgid => msgid,
-            MessageField::Msgstr => msgstr,
-        }
-    }
-}
-
-#[derive(Debug)]
-struct SourceMap<'a> {
-    messages: HashMap<&'a str, Vec<(usize, &'a str, &'a str)>>,
-}
-
-impl<'a> SourceMap<'a> {
-    /// Construct a map from source paths to links.
-    fn new(catalog: &'a Catalog) -> anyhow::Result<SourceMap<'a>> {
-        let mut messages = HashMap::<&str, Vec<_>>::new();
-        for message in catalog.messages() {
-            let path_linenos = message
-                .source()
-                .split_whitespace()
-                .map(|source| parse_source(source).unwrap_or((source, 0)));
-            for (path, lineno) in path_linenos {
-                messages.entry(path).or_default().push((
-                    lineno,
-                    message.msgid(),
-                    message.msgstr().unwrap_or_default(),
-                ));
-            }
-        }
-
-        for (_, value) in messages.iter_mut() {
-            value.sort();
-        }
-
-        Ok(SourceMap { messages })
-    }
-
-    /// Extract messages for `message`.
-    ///
-    /// Broken links are resolved using the other messages from the
-    /// same path in the source map.
-    fn extract_messages(
-        &self,
-        message: &dyn MessageView,
-        field: MessageField,
-    ) -> anyhow::Result<Vec<(usize, String)>> {
-        // The strategy is to parse the message alone, if possible. If
-        // it has a broken link, then we construct a larger document
-        // using all other messages with the same path. This way the
-        // link should be defined.
-        let document = field.project(message.msgid(), message.msgstr()?);
-        if !has_broken_link(document) {
-            return Ok(extract_messages(document));
-        }
-
-        // If `parse_source` fails, then `message` has more than one
-        // source. We won't attempt to resolve the broken link in that
-        // case since it is unclear which link definition to use.
-        let path = match parse_source(message.source()) {
-            Some((path, _)) => path,
-            None => return Ok(extract_messages(document)),
-        };
-
-        // First, we try constructing a document using other messages
-        // from the catalog. Catalogs from pre-0.1.0 included the link
-        // definitions.
-        //
-        // This will have quadratic complexity in case every message
-        // from `path` has a "[some text][1]" link which needs to be
-        // resolved using a table of link definitions at the bottom.
-        // However, in practice, only a few messages will have such a
-        // link and the whole thing seems to be fast enough.
-        let mut full_document = String::from(document);
-        for (_, msgid, msgstr) in &self.messages[path] {
-            let msg = field.project(msgid, msgstr);
-            if msg == document {
-                continue;
-            }
-            full_document.push_str("\n\n");
-            full_document.push_str(msg);
-        }
-
-        // Second, we attempt to add the original source file.
-        // Catalogs made with version 0.1.0 to 0.2.0 did not include
-        // the link definitions at all, so we need to rely on the
-        // source data (if we can find it).
-        if let Ok(mut file) = File::open(path) {
-            full_document.push_str("\n\n");
-            let _ = file.read_to_string(&mut full_document);
-        }
-
-        let mut messages = extract_messages(&full_document);
-        // Truncate away the messages from `full_document` which start
-        // after `document`.
-        let line_count = document.lines().count();
-        if let Some(pos) = messages.iter().position(|(lineno, _)| *lineno > line_count) {
-            messages.truncate(pos);
-        }
-        Ok(messages)
-    }
-}
-
-/// Normalize all entries in the catalog.
-///
-/// Both the `msgid` and the `msgstr` fields are sent through
-/// [`extract_messages`]. The resulting messages are emitted to a new
-/// catalog. If the normalization produces a different number of
-/// messages for the `msgid` and `msgstr` fields, then the result is
-/// marked fuzzy. The extra messages are dropped.
-pub fn normalize(catalog: Catalog) -> anyhow::Result<Catalog> {
-    let source_map = SourceMap::new(&catalog)?;
-
-    // Accumulate new messages here to avoid constructing a `Catalog`
-    // via a partial move from `catalog`.
-    let mut new_messages = Vec::new();
-    for message in catalog.messages() {
-        let new_msgids = source_map.extract_messages(message, MessageField::Msgid)?;
-        if new_msgids.is_empty() {
-            // Continue if there is nothing to normalize. This can
-            // happen if the old `msgid` is something like "<b>Foo</b>"
-            // since we no longer extract HTML elements.
-            continue;
-        }
-
-        let mut new_msgstrs = source_map.extract_messages(message, MessageField::Msgstr)?;
-        let mut flags = MessageFlags::new();
-        if message.is_fuzzy() || (message.is_translated() && new_msgids.len() != new_msgstrs.len())
-        {
-            // Keep existing fuzzy flag, or add a new one if we cannot
-            // split a translated message cleanly.
-            flags.add_flag("fuzzy");
-        }
-
-        match new_msgids.len().cmp(&new_msgstrs.len()) {
-            std::cmp::Ordering::Less => {
-                // Treat left-over translations as separate paragraphs.
-                // This makes normalization stable.
-                let tail = new_msgstrs[new_msgids.len() - 1..]
-                    .iter()
-                    .map(|(_, msgstr)| msgstr.as_str())
-                    .collect::<Vec<_>>()
-                    .join("\n\n");
-                new_msgstrs.truncate(new_msgids.len() - 1);
-                new_msgstrs.push((0, tail))
-            }
-            std::cmp::Ordering::Greater => {
-                // Set missing msgstr entries to "".
-                new_msgstrs.resize(new_msgids.len(), (0, String::new()));
-            }
-            _ => {}
-        }
-
-        for ((delta, msgid), (_, msgstr)) in std::iter::zip(new_msgids, new_msgstrs) {
-            let new_message = Message::build_singular()
-                .with_source(compute_source(message.source(), delta - 1))
-                .with_msgid(msgid)
-                .with_msgstr(msgstr)
-                .with_flags(flags.clone())
-                .done();
-            new_messages.push(new_message);
-        }
-    }
-
-    let mut new_catalog = Catalog::new(catalog.metadata);
-    for new_message in new_messages {
-        match new_catalog.find_message_mut(None, new_message.msgid(), None) {
-            Some(mut message) => {
-                if !message.is_translated() && new_message.is_translated() {
-                    message.set_msgstr(String::from(new_message.msgstr()?))?;
-                    // Because we normalize messages like "# Foo" and
-                    // "- Foo" to just "Foo", we can end up with
-                    // duplicates. In that case, it's important to
-                    // preserve the fuzzy flag.
-                    if new_message.is_fuzzy() {
-                        message.flags_mut().add_flag("fuzzy");
-                    }
-                }
-                message.source_mut().push('\n');
-                message.source_mut().push_str(new_message.source());
-            }
-            None => new_catalog.append_or_update(new_message),
-        }
-    }
-
-    Ok(new_catalog)
-}
 
 fn main() -> anyhow::Result<()> {
     let args = std::env::args().collect::<Vec<_>>();
@@ -272,305 +32,3 @@ fn main() -> anyhow::Result<()> {
 
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use polib::metadata::CatalogMetadata;
-    use pretty_assertions::assert_eq;
-
-    // Create a catalog from the translation pairs given.
-    fn create_catalog(translations: &[(&str, &str)]) -> Catalog {
-        let mut catalog = Catalog::new(CatalogMetadata::new());
-        for (idx, (msgid, msgstr)) in translations.iter().enumerate() {
-            let message = Message::build_singular()
-                .with_source(format!("foo.md:{idx}"))
-                .with_msgid(String::from(*msgid))
-                .with_msgstr(String::from(*msgstr))
-                .done();
-            catalog.append_or_update(message);
-        }
-        catalog
-    }
-
-    fn exact<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
-        (false, msgid, msgstr)
-    }
-
-    fn fuzzy<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
-        (true, msgid, msgstr)
-    }
-
-    #[track_caller]
-    fn assert_normalized_messages_eq(catalog: Catalog, expected_messages: &[(bool, &str, &str)]) {
-        let normalized = normalize(catalog).expect("Could not normalize");
-        let messages = normalized
-            .messages()
-            .map(|msg| (msg.is_fuzzy(), msg.msgid(), msg.msgstr().unwrap()))
-            .collect::<Vec<_>>();
-        assert_eq!(messages, expected_messages);
-    }
-
-    #[test]
-    fn test_normalize_untranslated() {
-        let catalog = create_catalog(&[("foo bar", "")]);
-        assert_normalized_messages_eq(catalog, &[exact("foo bar", "")]);
-    }
-
-    #[test]
-    fn test_normalize_first_wins() {
-        // When two or more msgid fields are normalized the same way,
-        // we use the first translated entry. The other is dropped.
- let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "# FOO 2")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]); - } - - #[test] - fn test_normalize_early_translation_wins() { - let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]); - } - - #[test] - fn test_normalize_late_translation_wins() { - let catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 2")]); - } - - #[test] - fn test_normalize_fuzzy_wins() { - let mut catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]); - // Make the second message fuzzy and check that this is copied - // to the normalized messages. - catalog - .messages_mut() - .nth(1) - .unwrap() - .flags_mut() - .add_flag("fuzzy"); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO 2")]); - } - - #[test] - fn test_normalize_softbreak() { - let catalog = create_catalog(&[("foo\nbar", "FOO\nBAR\nBAZ")]); - assert_normalized_messages_eq(catalog, &[exact("foo bar", "FOO BAR BAZ")]); - } - - #[test] - fn test_normalize_inline_link() { - let catalog = create_catalog(&[( - "foo [bar](http://example.net/) baz", - "FOO [BAR](http://example.net/) BAZ", - )]); - assert_normalized_messages_eq( - catalog, - &[exact( - "foo [bar](http://example.net/) baz", - "FOO [BAR](http://example.net/) BAZ", - )], - ); - } - - #[test] - fn test_normalize_reference_link() { - // Check that we can normalize a reference link when its link - // definition is in a different message. - let catalog = create_catalog(&[ - ("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."), - ( - "foo [bar][reference-link] baz", - "FOO [BAR][reference-link] BAZ", - ), - ("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."), - ( - "[reference-link]: http://example.net/\n\ - [other-link]: http://example.com/", - "[reference-link]: HTTP://EXAMPLE.NET/\n\ - [other-link]: HTTP://EXAMPLE.COM/", - ), - ]); - assert_normalized_messages_eq( - catalog, - &[ - exact("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."), - exact( - "foo [bar](http://example.net/) baz", - "FOO [BAR](HTTP://EXAMPLE.NET/) BAZ", - ), - exact("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."), - ], - ); - } - - #[test] - fn test_normalize_paragraphs() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar", - "FOO\n\n\ - BAR", - )]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]); - } - - #[test] - fn test_normalize_fuzzy_paragraphs_too_many() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar", - "FOO\n\n\ - BAR\n\n\ - BAZ", - )]); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]); - } - - #[test] - fn test_normalize_fuzzy_paragraphs_too_few() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar\n\n\ - baz", - "FOO\n\n\ - BAR", - )]); - assert_normalized_messages_eq( - catalog, - &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")], - ); - } - - #[test] - fn test_normalize_list_items() { - let catalog = create_catalog(&[( - "* foo\n\ - * bar", - "* FOO\n\ - * BAR", - )]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]); - } - - #[test] - fn test_normalize_fuzzy_list_items_too_many() { - let catalog = create_catalog(&[( - "* foo\n\ - * bar", - "* FOO\n\ - * BAR\n\ - * BAZ", - )]); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]); - } - - #[test] - 
fn test_normalize_fuzzy_list_items_too_few() {
-        let catalog = create_catalog(&[(
-            "* foo\n\
-             * bar\n\
-             * baz",
-            "* FOO\n\
-             * BAR",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
-        );
-    }
-
-    #[test]
-    fn test_normalize_disappearing_html() {
-        // Normalizing "<b>" results in no messages.
-        let catalog = create_catalog(&[("<b>", "FOO")]);
-        assert_normalized_messages_eq(catalog, &[]);
-    }
-
-    #[test]
-    fn test_normalize_code_blocks() {
-        let catalog = create_catalog(&[(
-            "```rust,editable\n\
-             // Example\n\
-             foo\n\
-             \n\
-             * bar\n\
-             ```",
-            "```rust,editable\n\
-             // Beispiel\n\
-             FOO\n\
-             \n\
-             * BAR\n\
-             ```",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[exact(
-                "```rust,editable\n\
-                 // Example\n\
-                 foo\n\
-                 \n\
-                 * bar\n\
-                 ```",
-                "```rust,editable\n\
-                 // Beispiel\n\
-                 FOO\n\
-                 \n\
-                 * BAR\n\
-                 ```",
-            )],
-        );
-    }
-
-    #[test]
-    fn test_normalize_block_quote() {
-        let catalog = create_catalog(&[(
-            "> foo bar\n\
-             > baz",
-            "> FOO BAR\n\
-             > BAZ",
-        )]);
-        assert_normalized_messages_eq(catalog, &[exact("foo bar baz", "FOO BAR BAZ")]);
-    }
-
-    #[test]
-    fn test_normalize_block_quote_with_list() {
-        let catalog = create_catalog(&[(
-            "> * foo bar\n\
-             >   baz\n\
-             > * quux",
-            "> * FOO BAR\n\
-             >   BAZ\n\
-             > * QUUX",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[exact("foo bar baz", "FOO BAR BAZ"), exact("quux", "QUUX")],
-        );
-    }
-
-    #[test]
-    fn test_normalize_table() {
-        let catalog = create_catalog(&[(
-            "\
-             |        | Types       |\n\
-             |--------|-------------|\n\
-             | Arrays | `[T; N]`    |\n\
-             | Tuples | `()`, ...   |",
-            "\
-             |        | TYPES |\n\
-             |---|---|\n\
-             | ARRAYS | `[T; N]` |\n\
-             | TUPLES | `()`, ... |",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[
-                exact("Types", "TYPES"),
-                exact("Arrays", "ARRAYS"),
-                exact("`[T; N]`", "`[T; N]`"),
-                exact("Tuples", "TUPLES"),
-                exact("`()`, ...", "`()`, ..."),
-            ],
-        );
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index 1b2494e3..bde7845c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,8 @@ use pulldown_cmark_to_cmark::{cmark_resume_with_options, Options, State};
 use regex::Regex;
 use std::sync::OnceLock;
 
+pub mod normalize;
+
 /// Like `mdbook::utils::new_cmark_parser`, but also passes a
 /// `BrokenLinkCallback`.
 pub fn new_cmark_parser<'input, 'callback>(
diff --git a/src/normalize.rs b/src/normalize.rs
new file mode 100644
index 00000000..e3656190
--- /dev/null
+++ b/src/normalize.rs
@@ -0,0 +1,548 @@
+//! Normalize the Markdown in a a PO or POT file.
+//!
+//! This file contains the main logic used by the binary `mdbook-i18n-normalize`.
+
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+
+use super::{extract_messages, new_cmark_parser};
+use polib::catalog::Catalog;
+use polib::message::{Message, MessageFlags, MessageMutView, MessageView};
+use pulldown_cmark::{Event, LinkType, Tag};
+
+fn parse_source(source: &str) -> Option<(&str, usize)> {
+    let (path, lineno) = source.split_once(':')?;
+    Some((path, lineno.parse().ok()?))
+}
+
+fn compute_source(source: &str, delta: usize) -> String {
+    let mut new_source = String::with_capacity(source.len());
+
+    for path_lineno in source.split_whitespace() {
+        if !new_source.is_empty() {
+            new_source.push('\n');
+        }
+        if let Some((path, lineno)) = parse_source(path_lineno) {
+            new_source.push_str(&format!("{path}:{}", lineno + delta));
+        } else {
+            new_source.push_str(source);
+        }
+    }
+
+    new_source
+}
+
+/// Check if `text` contains one or more broken reference links.
+fn has_broken_link(text: &str) -> bool {
+    // The return value from the callback is not important, it just
+    // has to return Some to generate a `LinkType::*Unknown`.
+    let mut callback = |_| Some(("".into(), "".into()));
+    new_cmark_parser(text, Some(&mut callback)).any(|event| {
+        matches!(
+            event,
+            Event::Start(Tag::Link(
+                LinkType::ReferenceUnknown | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown,
+                _,
+                _
+            ))
+        )
+    })
+}
+
+#[derive(Debug, Copy, Clone)]
+enum MessageField {
+    Msgid,
+    Msgstr,
+}
+
+impl MessageField {
+    fn project<'a>(&self, msgid: &'a str, msgstr: &'a str) -> &'a str {
+        match self {
+            MessageField::Msgid => msgid,
+            MessageField::Msgstr => msgstr,
+        }
+    }
+}
+
+#[derive(Debug)]
+struct SourceMap<'a> {
+    messages: HashMap<&'a str, Vec<(usize, &'a str, &'a str)>>,
+}
+
+impl<'a> SourceMap<'a> {
+    /// Construct a map from source paths to links.
+    fn new(catalog: &'a Catalog) -> anyhow::Result<SourceMap<'a>> {
+        let mut messages = HashMap::<&str, Vec<_>>::new();
+        for message in catalog.messages() {
+            let path_linenos = message
+                .source()
+                .split_whitespace()
+                .map(|source| parse_source(source).unwrap_or((source, 0)));
+            for (path, lineno) in path_linenos {
+                messages.entry(path).or_default().push((
+                    lineno,
+                    message.msgid(),
+                    message.msgstr().unwrap_or_default(),
+                ));
+            }
+        }
+
+        for (_, value) in messages.iter_mut() {
+            value.sort();
+        }
+
+        Ok(SourceMap { messages })
+    }
+
+    /// Extract messages for `message`.
+    ///
+    /// Broken links are resolved using the other messages from the
+    /// same path in the source map.
+    fn extract_messages(
+        &self,
+        message: &dyn MessageView,
+        field: MessageField,
+    ) -> anyhow::Result<Vec<(usize, String)>> {
+        // The strategy is to parse the message alone, if possible. If
+        // it has a broken link, then we construct a larger document
+        // using all other messages with the same path. This way the
+        // link should be defined.
+        let document = field.project(message.msgid(), message.msgstr()?);
+        if !has_broken_link(document) {
+            return Ok(extract_messages(document));
+        }
+
+        // If `parse_source` fails, then `message` has more than one
+        // source. We won't attempt to resolve the broken link in that
+        // case since it is unclear which link definition to use.
+        let path = match parse_source(message.source()) {
+            Some((path, _)) => path,
+            None => return Ok(extract_messages(document)),
+        };
+
+        // First, we try constructing a document using other messages
+        // from the catalog. Catalogs from pre-0.1.0 included the link
+        // definitions.
+        //
+        // This will have quadratic complexity in case every message
+        // from `path` has a "[some text][1]" link which needs to be
+        // resolved using a table of link definitions at the bottom.
+        // However, in practice, only a few messages will have such a
+        // link and the whole thing seems to be fast enough.
+        let mut full_document = String::from(document);
+        for (_, msgid, msgstr) in &self.messages[path] {
+            let msg = field.project(msgid, msgstr);
+            if msg == document {
+                continue;
+            }
+            full_document.push_str("\n\n");
+            full_document.push_str(msg);
+        }
+
+        // Second, we attempt to add the original source file.
+        // Catalogs made with version 0.1.0 to 0.2.0 did not include
+        // the link definitions at all, so we need to rely on the
+        // source data (if we can find it).
+        if let Ok(mut file) = File::open(path) {
+            full_document.push_str("\n\n");
+            let _ = file.read_to_string(&mut full_document);
+        }
+
+        let mut messages = extract_messages(&full_document);
+        // Truncate away the messages from `full_document` which start
+        // after `document`.
+        let line_count = document.lines().count();
+        if let Some(pos) = messages.iter().position(|(lineno, _)| *lineno > line_count) {
+            messages.truncate(pos);
+        }
+        Ok(messages)
+    }
+}
+
+/// Normalize all entries in the catalog.
+///
+/// Both the `msgid` and the `msgstr` fields are sent through
+/// [`extract_messages`]. The resulting messages are emitted to a new
+/// catalog. If the normalization produces a different number of
+/// messages for the `msgid` and `msgstr` fields, then the result is
+/// marked fuzzy. The extra messages are dropped.
+pub fn normalize(catalog: Catalog) -> anyhow::Result<Catalog> {
+    let source_map = SourceMap::new(&catalog)?;
+
+    // Accumulate new messages here to avoid constructing a `Catalog`
+    // via a partial move from `catalog`.
+    let mut new_messages = Vec::new();
+    for message in catalog.messages() {
+        let new_msgids = source_map.extract_messages(message, MessageField::Msgid)?;
+        if new_msgids.is_empty() {
+            // Continue if there is nothing to normalize. This can
+            // happen if the old `msgid` is something like "<b>Foo</b>"
+            // since we no longer extract HTML elements.
+            continue;
+        }
+
+        let mut new_msgstrs = source_map.extract_messages(message, MessageField::Msgstr)?;
+        let mut flags = MessageFlags::new();
+        if message.is_fuzzy() || (message.is_translated() && new_msgids.len() != new_msgstrs.len())
+        {
+            // Keep existing fuzzy flag, or add a new one if we cannot
+            // split a translated message cleanly.
+            flags.add_flag("fuzzy");
+        }
+
+        match new_msgids.len().cmp(&new_msgstrs.len()) {
+            std::cmp::Ordering::Less => {
+                // Treat left-over translations as separate paragraphs.
+                // This makes normalization stable.
+                let tail = new_msgstrs[new_msgids.len() - 1..]
+                    .iter()
+                    .map(|(_, msgstr)| msgstr.as_str())
+                    .collect::<Vec<_>>()
+                    .join("\n\n");
+                new_msgstrs.truncate(new_msgids.len() - 1);
+                new_msgstrs.push((0, tail))
+            }
+            std::cmp::Ordering::Greater => {
+                // Set missing msgstr entries to "".
+                new_msgstrs.resize(new_msgids.len(), (0, String::new()));
+            }
+            _ => {}
+        }
+
+        for ((delta, msgid), (_, msgstr)) in std::iter::zip(new_msgids, new_msgstrs) {
+            let new_message = Message::build_singular()
+                .with_source(compute_source(message.source(), delta - 1))
+                .with_msgid(msgid)
+                .with_msgstr(msgstr)
+                .with_flags(flags.clone())
+                .done();
+            new_messages.push(new_message);
+        }
+    }
+
+    let mut new_catalog = Catalog::new(catalog.metadata);
+    for new_message in new_messages {
+        match new_catalog.find_message_mut(None, new_message.msgid(), None) {
+            Some(mut message) => {
+                if !message.is_translated() && new_message.is_translated() {
+                    message.set_msgstr(String::from(new_message.msgstr()?))?;
+                    // Because we normalize messages like "# Foo" and
+                    // "- Foo" to just "Foo", we can end up with
+                    // duplicates. In that case, it's important to
+                    // preserve the fuzzy flag.
+                    if new_message.is_fuzzy() {
+                        message.flags_mut().add_flag("fuzzy");
+                    }
+                }
+                message.source_mut().push('\n');
+                message.source_mut().push_str(new_message.source());
+            }
+            None => new_catalog.append_or_update(new_message),
+        }
+    }
+
+    Ok(new_catalog)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use polib::metadata::CatalogMetadata;
+    use pretty_assertions::assert_eq;
+
+    // Create a catalog from the translation pairs given.
+    fn create_catalog(translations: &[(&str, &str)]) -> Catalog {
+        let mut catalog = Catalog::new(CatalogMetadata::new());
+        for (idx, (msgid, msgstr)) in translations.iter().enumerate() {
+            let message = Message::build_singular()
+                .with_source(format!("foo.md:{idx}"))
+                .with_msgid(String::from(*msgid))
+                .with_msgstr(String::from(*msgstr))
+                .done();
+            catalog.append_or_update(message);
+        }
+        catalog
+    }
+
+    fn exact<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
+        (false, msgid, msgstr)
+    }
+
+    fn fuzzy<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
+        (true, msgid, msgstr)
+    }
+
+    #[track_caller]
+    fn assert_normalized_messages_eq(catalog: Catalog, expected_messages: &[(bool, &str, &str)]) {
+        let normalized = normalize(catalog).expect("Could not normalize");
+        let messages = normalized
+            .messages()
+            .map(|msg| (msg.is_fuzzy(), msg.msgid(), msg.msgstr().unwrap()))
+            .collect::<Vec<_>>();
+        assert_eq!(messages, expected_messages);
+    }
+
+    #[test]
+    fn test_normalize_untranslated() {
+        let catalog = create_catalog(&[("foo bar", "")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar", "")]);
+    }
+
+    #[test]
+    fn test_normalize_first_wins() {
+        // When two or more msgid fields are normalized the same way,
+        // we use the first translated entry. The other is dropped.
+        let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "# FOO 2")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]);
+    }
+
+    #[test]
+    fn test_normalize_early_translation_wins() {
+        let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]);
+    }
+
+    #[test]
+    fn test_normalize_late_translation_wins() {
+        let catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 2")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_wins() {
+        let mut catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]);
+        // Make the second message fuzzy and check that this is copied
+        // to the normalized messages.
+        catalog
+            .messages_mut()
+            .nth(1)
+            .unwrap()
+            .flags_mut()
+            .add_flag("fuzzy");
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO 2")]);
+    }
+
+    #[test]
+    fn test_normalize_softbreak() {
+        let catalog = create_catalog(&[("foo\nbar", "FOO\nBAR\nBAZ")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar", "FOO BAR BAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_inline_link() {
+        let catalog = create_catalog(&[(
+            "foo [bar](http://example.net/) baz",
+            "FOO [BAR](http://example.net/) BAZ",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact(
+                "foo [bar](http://example.net/) baz",
+                "FOO [BAR](http://example.net/) BAZ",
+            )],
+        );
+    }
+
+    #[test]
+    fn test_normalize_reference_link() {
+        // Check that we can normalize a reference link when its link
+        // definition is in a different message.
+        let catalog = create_catalog(&[
+            ("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."),
+            (
+                "foo [bar][reference-link] baz",
+                "FOO [BAR][reference-link] BAZ",
+            ),
+            ("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."),
+            (
+                "[reference-link]: http://example.net/\n\
+                 [other-link]: http://example.com/",
+                "[reference-link]: HTTP://EXAMPLE.NET/\n\
+                 [other-link]: HTTP://EXAMPLE.COM/",
+            ),
+        ]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[
+                exact("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."),
+                exact(
+                    "foo [bar](http://example.net/) baz",
+                    "FOO [BAR](HTTP://EXAMPLE.NET/) BAZ",
+                ),
+                exact("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_normalize_paragraphs() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar",
+            "FOO\n\n\
+             BAR",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_paragraphs_too_many() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar",
+            "FOO\n\n\
+             BAR\n\n\
+             BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_paragraphs_too_few() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar\n\n\
+             baz",
+            "FOO\n\n\
+             BAR",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_list_items() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar",
+            "* FOO\n\
+             * BAR",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_list_items_too_many() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar",
+            "* FOO\n\
+             * BAR\n\
+             * BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_list_items_too_few() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar\n\
+             * baz",
+            "* FOO\n\
+             * BAR",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_disappearing_html() {
+        // Normalizing "<b>" results in no messages.
+        let catalog = create_catalog(&[("<b>", "FOO")]);
+        assert_normalized_messages_eq(catalog, &[]);
+    }
+
+    #[test]
+    fn test_normalize_code_blocks() {
+        let catalog = create_catalog(&[(
+            "```rust,editable\n\
+             // Example\n\
+             foo\n\
+             \n\
+             * bar\n\
+             ```",
+            "```rust,editable\n\
+             // Beispiel\n\
+             FOO\n\
+             \n\
+             * BAR\n\
+             ```",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact(
+                "```rust,editable\n\
+                 // Example\n\
+                 foo\n\
+                 \n\
+                 * bar\n\
+                 ```",
+                "```rust,editable\n\
+                 // Beispiel\n\
+                 FOO\n\
+                 \n\
+                 * BAR\n\
+                 ```",
+            )],
+        );
+    }
+
+    #[test]
+    fn test_normalize_block_quote() {
+        let catalog = create_catalog(&[(
+            "> foo bar\n\
+             > baz",
+            "> FOO BAR\n\
+             > BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar baz", "FOO BAR BAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_block_quote_with_list() {
+        let catalog = create_catalog(&[(
+            "> * foo bar\n\
+             >   baz\n\
+             > * quux",
+            "> * FOO BAR\n\
+             >   BAZ\n\
+             > * QUUX",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact("foo bar baz", "FOO BAR BAZ"), exact("quux", "QUUX")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_table() {
+        let catalog = create_catalog(&[(
+            "\
+             |        | Types       |\n\
+             |--------|-------------|\n\
+             | Arrays | `[T; N]`    |\n\
+             | Tuples | `()`, ...   |",
+            "\
+             |        | TYPES |\n\
+             |---|---|\n\
+             | ARRAYS | `[T; N]` |\n\
+             | TUPLES | `()`, ... |",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[
+                exact("Types", "TYPES"),
+                exact("Arrays", "ARRAYS"),
+                exact("`[T; N]`", "`[T; N]`"),
+                exact("Tuples", "TUPLES"),
+                exact("`()`, ...", "`()`, ..."),
+            ],
+        );
+    }
+}

From 0e54cde2619ac73fc10caa4392a9585994e96894 Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 1 Sep 2023 13:57:00 +0000
Subject: [PATCH 2/4] Add new fuzz target `normalize`

I tested this fuzz target with the changes from
https://github.com/google/mdbook-i18n-helpers/pull/56 reverted, and it
does detect the panic.
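
For a local run, the new target takes the same invocation as the
existing group_events fuzzer; the flags below mirror the CI setup
added in the next patch:

  cargo fuzz run normalize -- -only_ascii=1 -max_total_time=30
  cargo fuzz cmin normalize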
--- fuzz/Cargo.lock | 22 ++++++++++++---------- fuzz/Cargo.toml | 7 +++++++ fuzz/fuzz_targets/normalize.rs | 25 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 fuzz/fuzz_targets/normalize.rs diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index ce77e644..157488d5 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -483,13 +483,14 @@ dependencies = [ [[package]] name = "mdbook-i18n-helpers" -version = "0.1.0" +version = "0.2.2" dependencies = [ "anyhow", "mdbook", "polib", "pulldown-cmark", "pulldown-cmark-to-cmark", + "regex", "semver", "serde_json", ] @@ -500,14 +501,15 @@ version = "0.0.0" dependencies = [ "libfuzzer-sys", "mdbook-i18n-helpers", + "polib", "pretty_assertions", ] [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" [[package]] name = "num-traits" @@ -657,13 +659,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.0" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.0", + "regex-automata 0.3.8", "regex-syntax", ] @@ -675,9 +677,9 @@ checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-automata" -version = "0.3.0" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" dependencies = [ "aho-corasick", "memchr", @@ -686,9 +688,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.3" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "rustix" diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index d6c68f1b..f1d3322b 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -9,6 +9,7 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" +polib = "0.2.0" pretty_assertions = "1.3.0" [dependencies.mdbook-i18n-helpers] @@ -26,3 +27,9 @@ name = "group_events" path = "fuzz_targets/group_events.rs" test = false doc = false + +[[bin]] +name = "normalize" +path = "fuzz_targets/normalize.rs" +test = false +doc = false diff --git a/fuzz/fuzz_targets/normalize.rs b/fuzz/fuzz_targets/normalize.rs new file mode 100644 index 00000000..51128ebc --- /dev/null +++ b/fuzz/fuzz_targets/normalize.rs @@ -0,0 +1,25 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mdbook_i18n_helpers::normalize::normalize; +use polib::catalog::Catalog; +use polib::message::Message; +use polib::metadata::CatalogMetadata; + +fuzz_target!(|translations: Vec<(&str, &str)>| { + let catalog = create_catalog(translations); + let _ = normalize(catalog); +}); + +fn create_catalog(translations: Vec<(&str, &str)>) -> Catalog { + let mut catalog = Catalog::new(CatalogMetadata::new()); + for (idx, (msgid, msgstr)) in translations.iter().enumerate() { + let message = Message::build_singular() + 
.with_source(format!("foo.md:{idx}"))
+            .with_msgid(String::from(*msgid))
+            .with_msgstr(String::from(*msgstr))
+            .done();
+        catalog.append_or_update(message);
+    }
+    catalog
+}

From 8fdf4be233443f2041f2bf311221e78c2668c925 Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 15 Sep 2023 17:36:32 +0000
Subject: [PATCH 3/4] Update GitHub workflow to run the new normalize fuzzer

---
 .github/workflows/test.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 05dde6a2..36afd900 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,11 +48,15 @@ jobs:
           restore-keys: |
             fuzz-corpus
 
-      - name: Run fuzz test
-        run: cargo fuzz run group_events -- -only_ascii=1 -max_total_time=30
-
-      - name: Minimize fuzz corpus
-        run: cargo fuzz cmin group_events
+      - name: Run group_events fuzzer and minimize corpus
+        run: |
+          cargo fuzz run group_events -- -only_ascii=1 -max_total_time=30
+          cargo fuzz cmin group_events
+
+      - name: Run normalize fuzzer and minimize corpus
+        run: |
+          cargo fuzz run normalize -- -only_ascii=1 -max_total_time=30
+          cargo fuzz cmin normalize
 
   clippy:
     name: Clippy

From c77777c46443e766bd4f484fd7ab2b5abca23aab Mon Sep 17 00:00:00 2001
From: Martin Geisler
Date: Sat, 16 Sep 2023 16:27:21 +0200
Subject: [PATCH 4/4] Fix typo in src/normalize.rs

---
 src/normalize.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/normalize.rs b/src/normalize.rs
index e3656190..009a69a0 100644
--- a/src/normalize.rs
+++ b/src/normalize.rs
@@ -1,4 +1,4 @@
-//! Normalize the Markdown in a a PO or POT file.
+//! Normalize the Markdown in a PO or POT file.
 //!
 //! This file contains the main logic used by the binary `mdbook-i18n-normalize`.
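
A closing note on the new public API: with patch 1 applied, the
normalization pass can be driven as a library rather than only through
the binary. The sketch below is illustrative and not part of the series
itself; it assumes polib's `po_file::parse` and `po_file::write`
helpers (the same module the `mdbook-i18n-normalize` binary imports),
and `messages.po` is a placeholder path:

  use std::path::Path;

  use anyhow::Context;
  use mdbook_i18n_helpers::normalize::normalize;
  use polib::po_file;

  fn main() -> anyhow::Result<()> {
      // Parse an existing PO file, normalize the Markdown in its
      // messages, and write the result back in place.
      let catalog = po_file::parse(Path::new("messages.po"))
          .context("could not parse messages.po")?;
      let normalized = normalize(catalog)?;
      po_file::write(&normalized, Path::new("messages.po"))
          .context("could not write messages.po")?;
      Ok(())
  }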