From 788e3f33315d59edc3a781712483ddc646a3e7af Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 15 Sep 2023 15:36:36 +0000
Subject: [PATCH 1/4] Move the code from binary mdbook-i18n-normalize.rs into
 library code

This refactoring is needed so that fuzzing code can import and test
this logic.
---
 src/bin/mdbook-i18n-normalize.rs | 544 +-----------------------------
 src/lib.rs                       |   2 +
 src/normalize.rs                 | 548 +++++++++++++++++++++++++++++++
 3 files changed, 551 insertions(+), 543 deletions(-)
 create mode 100644 src/normalize.rs

diff --git a/src/bin/mdbook-i18n-normalize.rs b/src/bin/mdbook-i18n-normalize.rs
index a48641e7..b3be34f5 100644
--- a/src/bin/mdbook-i18n-normalize.rs
+++ b/src/bin/mdbook-i18n-normalize.rs
@@ -10,251 +10,11 @@
 //! safely move to a new version of the mdbook-i18n-helpers without
 //! losing existing translations.
 
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::Read;
 use std::path::Path;
 
 use anyhow::{bail, Context};
-use mdbook_i18n_helpers::{extract_messages, new_cmark_parser};
-use polib::catalog::Catalog;
-use polib::message::{Message, MessageFlags, MessageMutView, MessageView};
+use mdbook_i18n_helpers::normalize::normalize;
 use polib::po_file;
-use pulldown_cmark::{Event, LinkType, Tag};
-
-fn parse_source(source: &str) -> Option<(&str, usize)> {
-    let (path, lineno) = source.split_once(':')?;
-    Some((path, lineno.parse().ok()?))
-}
-
-fn compute_source(source: &str, delta: usize) -> String {
-    let mut new_source = String::with_capacity(source.len());
-
-    for path_lineno in source.split_whitespace() {
-        if !new_source.is_empty() {
-            new_source.push('\n');
-        }
-        if let Some((path, lineno)) = parse_source(path_lineno) {
-            new_source.push_str(&format!("{path}:{}", lineno + delta));
-        } else {
-            new_source.push_str(source);
-        }
-    }
-
-    new_source
-}
-
-/// Check if `text` contains one or more broken reference links.
-fn has_broken_link(text: &str) -> bool {
-    // The return value from the callback is not important, it just
-    // has to return Some to generate a `LinkType::*Unknown`.
-    let mut callback = |_| Some(("".into(), "".into()));
-    new_cmark_parser(text, Some(&mut callback)).any(|event| {
-        matches!(
-            event,
-            Event::Start(Tag::Link(
-                LinkType::ReferenceUnknown | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown,
-                _,
-                _
-            ))
-        )
-    })
-}
-
-#[derive(Debug, Copy, Clone)]
-enum MessageField {
-    Msgid,
-    Msgstr,
-}
-
-impl MessageField {
-    fn project<'a>(&self, msgid: &'a str, msgstr: &'a str) -> &'a str {
-        match self {
-            MessageField::Msgid => msgid,
-            MessageField::Msgstr => msgstr,
-        }
-    }
-}
-
-#[derive(Debug)]
-struct SourceMap<'a> {
-    messages: HashMap<&'a str, Vec<(usize, &'a str, &'a str)>>,
-}
-
-impl<'a> SourceMap<'a> {
-    /// Construct a map from source paths to links.
-    fn new(catalog: &'a Catalog) -> anyhow::Result<SourceMap<'a>> {
-        let mut messages = HashMap::<&str, Vec<_>>::new();
-        for message in catalog.messages() {
-            let path_linenos = message
-                .source()
-                .split_whitespace()
-                .map(|source| parse_source(source).unwrap_or((source, 0)));
-            for (path, lineno) in path_linenos {
-                messages.entry(path).or_default().push((
-                    lineno,
-                    message.msgid(),
-                    message.msgstr().unwrap_or_default(),
-                ));
-            }
-        }
-
-        for (_, value) in messages.iter_mut() {
-            value.sort();
-        }
-
-        Ok(SourceMap { messages })
-    }
-
-    /// Extract messages for `message`.
-    ///
-    /// Broken links are resolved using the other messages from the
-    /// same path in the source map.
-    fn extract_messages(
-        &self,
-        message: &dyn MessageView,
-        field: MessageField,
-    ) -> anyhow::Result<Vec<(usize, String)>> {
-        // The strategy is to parse the message alone, if possible. If
-        // it has a broken link, then we construct a larger document
-        // using all other messages with the same path. This way the
-        // link should be defined.
-        let document = field.project(message.msgid(), message.msgstr()?);
-        if !has_broken_link(document) {
-            return Ok(extract_messages(document));
-        }
-
-        // If `parse_source` fails, then `message` has more than one
-        // source. We won't attempt to resolve the broken link in that
-        // case since it is unclear which link definition to use.
-        let path = match parse_source(message.source()) {
-            Some((path, _)) => path,
-            None => return Ok(extract_messages(document)),
-        };
-
-        // First, we try constructing a document using other messages
-        // from the catalog. Catalogs from pre-0.1.0 included the link
-        // definitions.
-        //
-        // This will have quadratic complexity in case every message
-        // from `path` has a "[some text][1]" link which needs to be
-        // resolved using a table of link definitions at the bottom.
-        // However, in practice, only a few messages will have such a
-        // link and the whole thing seems to be fast enough.
-        let mut full_document = String::from(document);
-        for (_, msgid, msgstr) in &self.messages[path] {
-            let msg = field.project(msgid, msgstr);
-            if msg == document {
-                continue;
-            }
-            full_document.push_str("\n\n");
-            full_document.push_str(msg);
-        }
-
-        // Second, we attempt to add the original source file.
-        // Catalogs made with version 0.1.0 to 0.2.0 did not include
-        // the link definitions at all, so we need to rely on the
-        // source data (if we can find it).
-        if let Ok(mut file) = File::open(path) {
-            full_document.push_str("\n\n");
-            let _ = file.read_to_string(&mut full_document);
-        }
-
-        let mut messages = extract_messages(&full_document);
-        // Truncate away the messages from `full_document` which start
-        // after `document`.
-        let line_count = document.lines().count();
-        if let Some(pos) = messages.iter().position(|(lineno, _)| *lineno > line_count) {
-            messages.truncate(pos);
-        }
-        Ok(messages)
-    }
-}
-
-/// Normalize all entries in the catalog.
-///
-/// Both the `msgid` and the `msgstr` fields are sent through
-/// [`extract_messages`]. The resulting messages are emitted to a new
-/// catalog. If the normalization produces a different number of
-/// messages for the `msgid` and `msgstr` fields, then the result is
-/// marked fuzzy. The extra messages are dropped.
-pub fn normalize(catalog: Catalog) -> anyhow::Result<Catalog> {
-    let source_map = SourceMap::new(&catalog)?;
-
-    // Accumulate new messages here to avoid constructing a `Catalog`
-    // via a partial move from `catalog`.
-    let mut new_messages = Vec::new();
-    for message in catalog.messages() {
-        let new_msgids = source_map.extract_messages(message, MessageField::Msgid)?;
-        if new_msgids.is_empty() {
-            // Continue if there is nothing to normalize. This can
-            // happen if the old `msgid` is something like "<b>Foo</b>"
-            // since we no longer extract HTML elements.
-            continue;
-        }
-
-        let mut new_msgstrs = source_map.extract_messages(message, MessageField::Msgstr)?;
-        let mut flags = MessageFlags::new();
-        if message.is_fuzzy() || (message.is_translated() && new_msgids.len() != new_msgstrs.len())
-        {
-            // Keep existing fuzzy flag, or add a new one if we cannot
-            // split a translated message cleanly.
-            flags.add_flag("fuzzy");
-        }
-
-        match new_msgids.len().cmp(&new_msgstrs.len()) {
-            std::cmp::Ordering::Less => {
-                // Treat left-over translations as separate paragraphs.
-                // This makes normalization stable.
-                let tail = new_msgstrs[new_msgids.len() - 1..]
-                    .iter()
-                    .map(|(_, msgstr)| msgstr.as_str())
-                    .collect::<Vec<_>>()
-                    .join("\n\n");
-                new_msgstrs.truncate(new_msgids.len() - 1);
-                new_msgstrs.push((0, tail))
-            }
-            std::cmp::Ordering::Greater => {
-                // Set missing msgstr entries to "".
-                new_msgstrs.resize(new_msgids.len(), (0, String::new()));
-            }
-            _ => {}
-        }
-
-        for ((delta, msgid), (_, msgstr)) in std::iter::zip(new_msgids, new_msgstrs) {
-            let new_message = Message::build_singular()
-                .with_source(compute_source(message.source(), delta - 1))
-                .with_msgid(msgid)
-                .with_msgstr(msgstr)
-                .with_flags(flags.clone())
-                .done();
-            new_messages.push(new_message);
-        }
-    }
-
-    let mut new_catalog = Catalog::new(catalog.metadata);
-    for new_message in new_messages {
-        match new_catalog.find_message_mut(None, new_message.msgid(), None) {
-            Some(mut message) => {
-                if !message.is_translated() && new_message.is_translated() {
-                    message.set_msgstr(String::from(new_message.msgstr()?))?;
-                    // Because we normalize messages like "# Foo" and
-                    // "- Foo" to just "Foo", we can end up with
-                    // duplicates. In that case, it's important to
-                    // preserve the fuzzy flag.
-                    if new_message.is_fuzzy() {
-                        message.flags_mut().add_flag("fuzzy");
-                    }
-                }
-                message.source_mut().push('\n');
-                message.source_mut().push_str(new_message.source());
-            }
-            None => new_catalog.append_or_update(new_message),
-        }
-    }
-
-    Ok(new_catalog)
-}
 
 fn main() -> anyhow::Result<()> {
     let args = std::env::args().collect::<Vec<_>>();
@@ -272,305 +32,3 @@ fn main() -> anyhow::Result<()> {
 
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use polib::metadata::CatalogMetadata;
-    use pretty_assertions::assert_eq;
-
-    // Create a catalog from the translation pairs given.
-    fn create_catalog(translations: &[(&str, &str)]) -> Catalog {
-        let mut catalog = Catalog::new(CatalogMetadata::new());
-        for (idx, (msgid, msgstr)) in translations.iter().enumerate() {
-            let message = Message::build_singular()
-                .with_source(format!("foo.md:{idx}"))
-                .with_msgid(String::from(*msgid))
-                .with_msgstr(String::from(*msgstr))
-                .done();
-            catalog.append_or_update(message);
-        }
-        catalog
-    }
-
-    fn exact<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
-        (false, msgid, msgstr)
-    }
-
-    fn fuzzy<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
-        (true, msgid, msgstr)
-    }
-
-    #[track_caller]
-    fn assert_normalized_messages_eq(catalog: Catalog, expected_messages: &[(bool, &str, &str)]) {
-        let normalized = normalize(catalog).expect("Could not normalize");
-        let messages = normalized
-            .messages()
-            .map(|msg| (msg.is_fuzzy(), msg.msgid(), msg.msgstr().unwrap()))
-            .collect::<Vec<_>>();
-        assert_eq!(messages, expected_messages);
-    }
-
-    #[test]
-    fn test_normalize_untranslated() {
-        let catalog = create_catalog(&[("foo bar", "")]);
-        assert_normalized_messages_eq(catalog, &[exact("foo bar", "")]);
-    }
-
-    #[test]
-    fn test_normalize_first_wins() {
-        // When two or more msgid fields are normalized the same way,
-        // we use the first translated entry. The other is dropped.
- let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "# FOO 2")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]); - } - - #[test] - fn test_normalize_early_translation_wins() { - let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]); - } - - #[test] - fn test_normalize_late_translation_wins() { - let catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 2")]); - } - - #[test] - fn test_normalize_fuzzy_wins() { - let mut catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]); - // Make the second message fuzzy and check that this is copied - // to the normalized messages. - catalog - .messages_mut() - .nth(1) - .unwrap() - .flags_mut() - .add_flag("fuzzy"); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO 2")]); - } - - #[test] - fn test_normalize_softbreak() { - let catalog = create_catalog(&[("foo\nbar", "FOO\nBAR\nBAZ")]); - assert_normalized_messages_eq(catalog, &[exact("foo bar", "FOO BAR BAZ")]); - } - - #[test] - fn test_normalize_inline_link() { - let catalog = create_catalog(&[( - "foo [bar](http://example.net/) baz", - "FOO [BAR](http://example.net/) BAZ", - )]); - assert_normalized_messages_eq( - catalog, - &[exact( - "foo [bar](http://example.net/) baz", - "FOO [BAR](http://example.net/) BAZ", - )], - ); - } - - #[test] - fn test_normalize_reference_link() { - // Check that we can normalize a reference link when its link - // definition is in a different message. - let catalog = create_catalog(&[ - ("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."), - ( - "foo [bar][reference-link] baz", - "FOO [BAR][reference-link] BAZ", - ), - ("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."), - ( - "[reference-link]: http://example.net/\n\ - [other-link]: http://example.com/", - "[reference-link]: HTTP://EXAMPLE.NET/\n\ - [other-link]: HTTP://EXAMPLE.COM/", - ), - ]); - assert_normalized_messages_eq( - catalog, - &[ - exact("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."), - exact( - "foo [bar](http://example.net/) baz", - "FOO [BAR](HTTP://EXAMPLE.NET/) BAZ", - ), - exact("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."), - ], - ); - } - - #[test] - fn test_normalize_paragraphs() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar", - "FOO\n\n\ - BAR", - )]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]); - } - - #[test] - fn test_normalize_fuzzy_paragraphs_too_many() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar", - "FOO\n\n\ - BAR\n\n\ - BAZ", - )]); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]); - } - - #[test] - fn test_normalize_fuzzy_paragraphs_too_few() { - let catalog = create_catalog(&[( - "foo\n\n\ - bar\n\n\ - baz", - "FOO\n\n\ - BAR", - )]); - assert_normalized_messages_eq( - catalog, - &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")], - ); - } - - #[test] - fn test_normalize_list_items() { - let catalog = create_catalog(&[( - "* foo\n\ - * bar", - "* FOO\n\ - * BAR", - )]); - assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]); - } - - #[test] - fn test_normalize_fuzzy_list_items_too_many() { - let catalog = create_catalog(&[( - "* foo\n\ - * bar", - "* FOO\n\ - * BAR\n\ - * BAZ", - )]); - assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]); - } - - #[test] - 
fn test_normalize_fuzzy_list_items_too_few() {
-        let catalog = create_catalog(&[(
-            "* foo\n\
-             * bar\n\
-             * baz",
-            "* FOO\n\
-             * BAR",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
-        );
-    }
-
-    #[test]
-    fn test_normalize_disappearing_html() {
-        // Normalizing "<b>" results in no messages.
-        let catalog = create_catalog(&[("<b>", "FOO")]);
-        assert_normalized_messages_eq(catalog, &[]);
-    }
-
-    #[test]
-    fn test_normalize_code_blocks() {
-        let catalog = create_catalog(&[(
-            "```rust,editable\n\
-             // Example\n\
-             foo\n\
-             \n\
-             * bar\n\
-             ```",
-            "```rust,editable\n\
-             // Beispiel\n\
-             FOO\n\
-             \n\
-             * BAR\n\
-             ```",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[exact(
-                "```rust,editable\n\
-                 // Example\n\
-                 foo\n\
-                 \n\
-                 * bar\n\
-                 ```",
-                "```rust,editable\n\
-                 // Beispiel\n\
-                 FOO\n\
-                 \n\
-                 * BAR\n\
-                 ```",
-            )],
-        );
-    }
-
-    #[test]
-    fn test_normalize_block_quote() {
-        let catalog = create_catalog(&[(
-            "> foo bar\n\
-             > baz",
-            "> FOO BAR\n\
-             > BAZ",
-        )]);
-        assert_normalized_messages_eq(catalog, &[exact("foo bar baz", "FOO BAR BAZ")]);
-    }
-
-    #[test]
-    fn test_normalize_block_quote_with_list() {
-        let catalog = create_catalog(&[(
-            "> * foo bar\n\
-             >   baz\n\
-             > * quux",
-            "> * FOO BAR\n\
-             >   BAZ\n\
-             > * QUUX",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[exact("foo bar baz", "FOO BAR BAZ"), exact("quux", "QUUX")],
-        );
-    }
-
-    #[test]
-    fn test_normalize_table() {
-        let catalog = create_catalog(&[(
-            "\
-             |        | Types       |\n\
-             |--------|-------------|\n\
-             | Arrays | `[T; N]`    |\n\
-             | Tuples | `()`, ...   |",
-            "\
-             |        | TYPES |\n\
-             |---|---|\n\
-             | ARRAYS | `[T; N]` |\n\
-             | TUPLES | `()`, ... |",
-        )]);
-        assert_normalized_messages_eq(
-            catalog,
-            &[
-                exact("Types", "TYPES"),
-                exact("Arrays", "ARRAYS"),
-                exact("`[T; N]`", "`[T; N]`"),
-                exact("Tuples", "TUPLES"),
-                exact("`()`, ...", "`()`, ..."),
-            ],
-        );
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index 1b2494e3..bde7845c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,8 @@ use pulldown_cmark_to_cmark::{cmark_resume_with_options, Options, State};
 use regex::Regex;
 use std::sync::OnceLock;
 
+pub mod normalize;
+
 /// Like `mdbook::utils::new_cmark_parser`, but also passes a
 /// `BrokenLinkCallback`.
 pub fn new_cmark_parser<'input, 'callback>(
diff --git a/src/normalize.rs b/src/normalize.rs
new file mode 100644
index 00000000..e3656190
--- /dev/null
+++ b/src/normalize.rs
@@ -0,0 +1,548 @@
+//! Normalize the Markdown in a a PO or POT file.
+//!
+//! This file contains the main logic used by the binary `mdbook-i18n-normalize`.
+
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+
+use super::{extract_messages, new_cmark_parser};
+use polib::catalog::Catalog;
+use polib::message::{Message, MessageFlags, MessageMutView, MessageView};
+use pulldown_cmark::{Event, LinkType, Tag};
+
+fn parse_source(source: &str) -> Option<(&str, usize)> {
+    let (path, lineno) = source.split_once(':')?;
+    Some((path, lineno.parse().ok()?))
+}
+
+fn compute_source(source: &str, delta: usize) -> String {
+    let mut new_source = String::with_capacity(source.len());
+
+    for path_lineno in source.split_whitespace() {
+        if !new_source.is_empty() {
+            new_source.push('\n');
+        }
+        if let Some((path, lineno)) = parse_source(path_lineno) {
+            new_source.push_str(&format!("{path}:{}", lineno + delta));
+        } else {
+            new_source.push_str(source);
+        }
+    }
+
+    new_source
+}
+
+/// Check if `text` contains one or more broken reference links.
+fn has_broken_link(text: &str) -> bool {
+    // The return value from the callback is not important, it just
+    // has to return Some to generate a `LinkType::*Unknown`.
+    let mut callback = |_| Some(("".into(), "".into()));
+    new_cmark_parser(text, Some(&mut callback)).any(|event| {
+        matches!(
+            event,
+            Event::Start(Tag::Link(
+                LinkType::ReferenceUnknown | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown,
+                _,
+                _
+            ))
+        )
+    })
+}
+
+#[derive(Debug, Copy, Clone)]
+enum MessageField {
+    Msgid,
+    Msgstr,
+}
+
+impl MessageField {
+    fn project<'a>(&self, msgid: &'a str, msgstr: &'a str) -> &'a str {
+        match self {
+            MessageField::Msgid => msgid,
+            MessageField::Msgstr => msgstr,
+        }
+    }
+}
+
+#[derive(Debug)]
+struct SourceMap<'a> {
+    messages: HashMap<&'a str, Vec<(usize, &'a str, &'a str)>>,
+}
+
+impl<'a> SourceMap<'a> {
+    /// Construct a map from source paths to links.
+    fn new(catalog: &'a Catalog) -> anyhow::Result<SourceMap<'a>> {
+        let mut messages = HashMap::<&str, Vec<_>>::new();
+        for message in catalog.messages() {
+            let path_linenos = message
+                .source()
+                .split_whitespace()
+                .map(|source| parse_source(source).unwrap_or((source, 0)));
+            for (path, lineno) in path_linenos {
+                messages.entry(path).or_default().push((
+                    lineno,
+                    message.msgid(),
+                    message.msgstr().unwrap_or_default(),
+                ));
+            }
+        }
+
+        for (_, value) in messages.iter_mut() {
+            value.sort();
+        }
+
+        Ok(SourceMap { messages })
+    }
+
+    /// Extract messages for `message`.
+    ///
+    /// Broken links are resolved using the other messages from the
+    /// same path in the source map.
+    fn extract_messages(
+        &self,
+        message: &dyn MessageView,
+        field: MessageField,
+    ) -> anyhow::Result<Vec<(usize, String)>> {
+        // The strategy is to parse the message alone, if possible. If
+        // it has a broken link, then we construct a larger document
+        // using all other messages with the same path. This way the
+        // link should be defined.
+        let document = field.project(message.msgid(), message.msgstr()?);
+        if !has_broken_link(document) {
+            return Ok(extract_messages(document));
+        }
+
+        // If `parse_source` fails, then `message` has more than one
+        // source. We won't attempt to resolve the broken link in that
+        // case since it is unclear which link definition to use.
+        let path = match parse_source(message.source()) {
+            Some((path, _)) => path,
+            None => return Ok(extract_messages(document)),
+        };
+
+        // First, we try constructing a document using other messages
+        // from the catalog. Catalogs from pre-0.1.0 included the link
+        // definitions.
+        //
+        // This will have quadratic complexity in case every message
+        // from `path` has a "[some text][1]" link which needs to be
+        // resolved using a table of link definitions at the bottom.
+        // However, in practice, only a few messages will have such a
+        // link and the whole thing seems to be fast enough.
+        let mut full_document = String::from(document);
+        for (_, msgid, msgstr) in &self.messages[path] {
+            let msg = field.project(msgid, msgstr);
+            if msg == document {
+                continue;
+            }
+            full_document.push_str("\n\n");
+            full_document.push_str(msg);
+        }
+
+        // Second, we attempt to add the original source file.
+        // Catalogs made with version 0.1.0 to 0.2.0 did not include
+        // the link definitions at all, so we need to rely on the
+        // source data (if we can find it).
+        if let Ok(mut file) = File::open(path) {
+            full_document.push_str("\n\n");
+            let _ = file.read_to_string(&mut full_document);
+        }
+
+        let mut messages = extract_messages(&full_document);
+        // Truncate away the messages from `full_document` which start
+        // after `document`.
+        let line_count = document.lines().count();
+        if let Some(pos) = messages.iter().position(|(lineno, _)| *lineno > line_count) {
+            messages.truncate(pos);
+        }
+        Ok(messages)
+    }
+}
+
+/// Normalize all entries in the catalog.
+///
+/// Both the `msgid` and the `msgstr` fields are sent through
+/// [`extract_messages`]. The resulting messages are emitted to a new
+/// catalog. If the normalization produces a different number of
+/// messages for the `msgid` and `msgstr` fields, then the result is
+/// marked fuzzy. The extra messages are dropped.
+pub fn normalize(catalog: Catalog) -> anyhow::Result<Catalog> {
+    let source_map = SourceMap::new(&catalog)?;
+
+    // Accumulate new messages here to avoid constructing a `Catalog`
+    // via a partial move from `catalog`.
+    let mut new_messages = Vec::new();
+    for message in catalog.messages() {
+        let new_msgids = source_map.extract_messages(message, MessageField::Msgid)?;
+        if new_msgids.is_empty() {
+            // Continue if there is nothing to normalize. This can
+            // happen if the old `msgid` is something like "<b>Foo</b>"
+            // since we no longer extract HTML elements.
+            continue;
+        }
+
+        let mut new_msgstrs = source_map.extract_messages(message, MessageField::Msgstr)?;
+        let mut flags = MessageFlags::new();
+        if message.is_fuzzy() || (message.is_translated() && new_msgids.len() != new_msgstrs.len())
+        {
+            // Keep existing fuzzy flag, or add a new one if we cannot
+            // split a translated message cleanly.
+            flags.add_flag("fuzzy");
+        }
+
+        match new_msgids.len().cmp(&new_msgstrs.len()) {
+            std::cmp::Ordering::Less => {
+                // Treat left-over translations as separate paragraphs.
+                // This makes normalization stable.
+                let tail = new_msgstrs[new_msgids.len() - 1..]
+                    .iter()
+                    .map(|(_, msgstr)| msgstr.as_str())
+                    .collect::<Vec<_>>()
+                    .join("\n\n");
+                new_msgstrs.truncate(new_msgids.len() - 1);
+                new_msgstrs.push((0, tail))
+            }
+            std::cmp::Ordering::Greater => {
+                // Set missing msgstr entries to "".
+                new_msgstrs.resize(new_msgids.len(), (0, String::new()));
+            }
+            _ => {}
+        }
+
+        for ((delta, msgid), (_, msgstr)) in std::iter::zip(new_msgids, new_msgstrs) {
+            let new_message = Message::build_singular()
+                .with_source(compute_source(message.source(), delta - 1))
+                .with_msgid(msgid)
+                .with_msgstr(msgstr)
+                .with_flags(flags.clone())
+                .done();
+            new_messages.push(new_message);
+        }
+    }
+
+    let mut new_catalog = Catalog::new(catalog.metadata);
+    for new_message in new_messages {
+        match new_catalog.find_message_mut(None, new_message.msgid(), None) {
+            Some(mut message) => {
+                if !message.is_translated() && new_message.is_translated() {
+                    message.set_msgstr(String::from(new_message.msgstr()?))?;
+                    // Because we normalize messages like "# Foo" and
+                    // "- Foo" to just "Foo", we can end up with
+                    // duplicates. In that case, it's important to
+                    // preserve the fuzzy flag.
+                    if new_message.is_fuzzy() {
+                        message.flags_mut().add_flag("fuzzy");
+                    }
+                }
+                message.source_mut().push('\n');
+                message.source_mut().push_str(new_message.source());
+            }
+            None => new_catalog.append_or_update(new_message),
+        }
+    }
+
+    Ok(new_catalog)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use polib::metadata::CatalogMetadata;
+    use pretty_assertions::assert_eq;
+
+    // Create a catalog from the translation pairs given.
+    fn create_catalog(translations: &[(&str, &str)]) -> Catalog {
+        let mut catalog = Catalog::new(CatalogMetadata::new());
+        for (idx, (msgid, msgstr)) in translations.iter().enumerate() {
+            let message = Message::build_singular()
+                .with_source(format!("foo.md:{idx}"))
+                .with_msgid(String::from(*msgid))
+                .with_msgstr(String::from(*msgstr))
+                .done();
+            catalog.append_or_update(message);
+        }
+        catalog
+    }
+
+    fn exact<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
+        (false, msgid, msgstr)
+    }
+
+    fn fuzzy<'a>(msgid: &'a str, msgstr: &'a str) -> (bool, &'a str, &'a str) {
+        (true, msgid, msgstr)
+    }
+
+    #[track_caller]
+    fn assert_normalized_messages_eq(catalog: Catalog, expected_messages: &[(bool, &str, &str)]) {
+        let normalized = normalize(catalog).expect("Could not normalize");
+        let messages = normalized
+            .messages()
+            .map(|msg| (msg.is_fuzzy(), msg.msgid(), msg.msgstr().unwrap()))
+            .collect::<Vec<_>>();
+        assert_eq!(messages, expected_messages);
+    }
+
+    #[test]
+    fn test_normalize_untranslated() {
+        let catalog = create_catalog(&[("foo bar", "")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar", "")]);
+    }
+
+    #[test]
+    fn test_normalize_first_wins() {
+        // When two or more msgid fields are normalized the same way,
+        // we use the first translated entry. The other is dropped.
+        let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "# FOO 2")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]);
+    }
+
+    #[test]
+    fn test_normalize_early_translation_wins() {
+        let catalog = create_catalog(&[("foo", "FOO 1"), ("# foo", "")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 1")]);
+    }
+
+    #[test]
+    fn test_normalize_late_translation_wins() {
+        let catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO 2")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_wins() {
+        let mut catalog = create_catalog(&[("foo", ""), ("# foo", "# FOO 2")]);
+        // Make the second message fuzzy and check that this is copied
+        // to the normalized messages.
+        catalog
+            .messages_mut()
+            .nth(1)
+            .unwrap()
+            .flags_mut()
+            .add_flag("fuzzy");
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO 2")]);
+    }
+
+    #[test]
+    fn test_normalize_softbreak() {
+        let catalog = create_catalog(&[("foo\nbar", "FOO\nBAR\nBAZ")]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar", "FOO BAR BAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_inline_link() {
+        let catalog = create_catalog(&[(
+            "foo [bar](http://example.net/) baz",
+            "FOO [BAR](http://example.net/) BAZ",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact(
+                "foo [bar](http://example.net/) baz",
+                "FOO [BAR](http://example.net/) BAZ",
+            )],
+        );
+    }
+
+    #[test]
+    fn test_normalize_reference_link() {
+        // Check that we can normalize a reference link when its link
+        // definition is in a different message.
+        let catalog = create_catalog(&[
+            ("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."),
+            (
+                "foo [bar][reference-link] baz",
+                "FOO [BAR][reference-link] BAZ",
+            ),
+            ("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."),
+            (
+                "[reference-link]: http://example.net/\n\
+                 [other-link]: http://example.com/",
+                "[reference-link]: HTTP://EXAMPLE.NET/\n\
+                 [other-link]: HTTP://EXAMPLE.COM/",
+            ),
+        ]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[
+                exact("Unrelated paragraph before.", "UNRELATED PARAGRAPH BEFORE."),
+                exact(
+                    "foo [bar](http://example.net/) baz",
+                    "FOO [BAR](HTTP://EXAMPLE.NET/) BAZ",
+                ),
+                exact("Unrelated paragraph after.", "UNRELATED PARAGRAPH AFTER."),
+            ],
+        );
+    }
+
+    #[test]
+    fn test_normalize_paragraphs() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar",
+            "FOO\n\n\
+             BAR",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_paragraphs_too_many() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar",
+            "FOO\n\n\
+             BAR\n\n\
+             BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_paragraphs_too_few() {
+        let catalog = create_catalog(&[(
+            "foo\n\n\
+             bar\n\n\
+             baz",
+            "FOO\n\n\
+             BAR",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_list_items() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar",
+            "* FOO\n\
+             * BAR",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo", "FOO"), exact("bar", "BAR")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_list_items_too_many() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar",
+            "* FOO\n\
+             * BAR\n\
+             * BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR\n\nBAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_fuzzy_list_items_too_few() {
+        let catalog = create_catalog(&[(
+            "* foo\n\
+             * bar\n\
+             * baz",
+            "* FOO\n\
+             * BAR",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[fuzzy("foo", "FOO"), fuzzy("bar", "BAR"), fuzzy("baz", "")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_disappearing_html() {
+        // Normalizing "<b>" results in no messages.
+        let catalog = create_catalog(&[("<b>", "FOO")]);
+        assert_normalized_messages_eq(catalog, &[]);
+    }
+
+    #[test]
+    fn test_normalize_code_blocks() {
+        let catalog = create_catalog(&[(
+            "```rust,editable\n\
+             // Example\n\
+             foo\n\
+             \n\
+             * bar\n\
+             ```",
+            "```rust,editable\n\
+             // Beispiel\n\
+             FOO\n\
+             \n\
+             * BAR\n\
+             ```",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact(
+                "```rust,editable\n\
+                 // Example\n\
+                 foo\n\
+                 \n\
+                 * bar\n\
+                 ```",
+                "```rust,editable\n\
+                 // Beispiel\n\
+                 FOO\n\
+                 \n\
+                 * BAR\n\
+                 ```",
+            )],
+        );
+    }
+
+    #[test]
+    fn test_normalize_block_quote() {
+        let catalog = create_catalog(&[(
+            "> foo bar\n\
+             > baz",
+            "> FOO BAR\n\
+             > BAZ",
+        )]);
+        assert_normalized_messages_eq(catalog, &[exact("foo bar baz", "FOO BAR BAZ")]);
+    }
+
+    #[test]
+    fn test_normalize_block_quote_with_list() {
+        let catalog = create_catalog(&[(
+            "> * foo bar\n\
+             >   baz\n\
+             > * quux",
+            "> * FOO BAR\n\
+             >   BAZ\n\
+             > * QUUX",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[exact("foo bar baz", "FOO BAR BAZ"), exact("quux", "QUUX")],
+        );
+    }
+
+    #[test]
+    fn test_normalize_table() {
+        let catalog = create_catalog(&[(
+            "\
+             |        | Types       |\n\
+             |--------|-------------|\n\
+             | Arrays | `[T; N]`    |\n\
+             | Tuples | `()`, ...   |",
+            "\
+             |        | TYPES |\n\
+             |---|---|\n\
+             | ARRAYS | `[T; N]` |\n\
+             | TUPLES | `()`, ... |",
+        )]);
+        assert_normalized_messages_eq(
+            catalog,
+            &[
+                exact("Types", "TYPES"),
+                exact("Arrays", "ARRAYS"),
+                exact("`[T; N]`", "`[T; N]`"),
+                exact("Tuples", "TUPLES"),
+                exact("`()`, ...", "`()`, ..."),
+            ],
+        );
+    }
+}

From 0e54cde2619ac73fc10caa4392a9585994e96894 Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 1 Sep 2023 13:57:00 +0000
Subject: [PATCH 2/4] Add new fuzz target `normalize`

I tested this fuzz target with the changes from
https://github.com/google/mdbook-i18n-helpers/pull/56 reverted, and it
does detect the panic.
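
For a local run, the new target takes the same invocation as the
existing group_events fuzzer; the flags below mirror the CI setup
added in the next patch:

  cargo fuzz run normalize -- -only_ascii=1 -max_total_time=30
  cargo fuzz cmin normalize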
--- fuzz/Cargo.lock | 22 ++++++++++++---------- fuzz/Cargo.toml | 7 +++++++ fuzz/fuzz_targets/normalize.rs | 25 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 fuzz/fuzz_targets/normalize.rs diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index ce77e644..157488d5 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -483,13 +483,14 @@ dependencies = [ [[package]] name = "mdbook-i18n-helpers" -version = "0.1.0" +version = "0.2.2" dependencies = [ "anyhow", "mdbook", "polib", "pulldown-cmark", "pulldown-cmark-to-cmark", + "regex", "semver", "serde_json", ] @@ -500,14 +501,15 @@ version = "0.0.0" dependencies = [ "libfuzzer-sys", "mdbook-i18n-helpers", + "polib", "pretty_assertions", ] [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" [[package]] name = "num-traits" @@ -657,13 +659,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.0" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.0", + "regex-automata 0.3.8", "regex-syntax", ] @@ -675,9 +677,9 @@ checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-automata" -version = "0.3.0" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" dependencies = [ "aho-corasick", "memchr", @@ -686,9 +688,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.3" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "rustix" diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index d6c68f1b..f1d3322b 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -9,6 +9,7 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" +polib = "0.2.0" pretty_assertions = "1.3.0" [dependencies.mdbook-i18n-helpers] @@ -26,3 +27,9 @@ name = "group_events" path = "fuzz_targets/group_events.rs" test = false doc = false + +[[bin]] +name = "normalize" +path = "fuzz_targets/normalize.rs" +test = false +doc = false diff --git a/fuzz/fuzz_targets/normalize.rs b/fuzz/fuzz_targets/normalize.rs new file mode 100644 index 00000000..51128ebc --- /dev/null +++ b/fuzz/fuzz_targets/normalize.rs @@ -0,0 +1,25 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mdbook_i18n_helpers::normalize::normalize; +use polib::catalog::Catalog; +use polib::message::Message; +use polib::metadata::CatalogMetadata; + +fuzz_target!(|translations: Vec<(&str, &str)>| { + let catalog = create_catalog(translations); + let _ = normalize(catalog); +}); + +fn create_catalog(translations: Vec<(&str, &str)>) -> Catalog { + let mut catalog = Catalog::new(CatalogMetadata::new()); + for (idx, (msgid, msgstr)) in translations.iter().enumerate() { + let message = Message::build_singular() + 
.with_source(format!("foo.md:{idx}"))
+            .with_msgid(String::from(*msgid))
+            .with_msgstr(String::from(*msgstr))
+            .done();
+        catalog.append_or_update(message);
+    }
+    catalog
+}

From 8fdf4be233443f2041f2bf311221e78c2668c925 Mon Sep 17 00:00:00 2001
From: Darkhan Kubigenov
Date: Fri, 15 Sep 2023 17:36:32 +0000
Subject: [PATCH 3/4] Update GitHub workflow to run the new normalize fuzzer

---
 .github/workflows/test.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 05dde6a2..36afd900 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,11 +48,15 @@ jobs:
           restore-keys: |
             fuzz-corpus
 
-      - name: Run fuzz test
-        run: cargo fuzz run group_events -- -only_ascii=1 -max_total_time=30
-
-      - name: Minimize fuzz corpus
-        run: cargo fuzz cmin group_events
+      - name: Run group_events fuzzer and minimize corpus
+        run: |
+          cargo fuzz run group_events -- -only_ascii=1 -max_total_time=30
+          cargo fuzz cmin group_events
+
+      - name: Run normalize fuzzer and minimize corpus
+        run: |
+          cargo fuzz run normalize -- -only_ascii=1 -max_total_time=30
+          cargo fuzz cmin normalize
 
   clippy:
     name: Clippy

From c77777c46443e766bd4f484fd7ab2b5abca23aab Mon Sep 17 00:00:00 2001
From: Martin Geisler
Date: Sat, 16 Sep 2023 16:27:21 +0200
Subject: [PATCH 4/4] Fix typo in src/normalize.rs

---
 src/normalize.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/normalize.rs b/src/normalize.rs
index e3656190..009a69a0 100644
--- a/src/normalize.rs
+++ b/src/normalize.rs
@@ -1,4 +1,4 @@
-//! Normalize the Markdown in a a PO or POT file.
+//! Normalize the Markdown in a PO or POT file.
 //!
 //! This file contains the main logic used by the binary `mdbook-i18n-normalize`.
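
A closing note on the new public API: with patch 1 applied, the
normalization pass can be driven as a library rather than only through
the binary. The sketch below is illustrative and not part of the series
itself; it assumes polib's `po_file::parse` and `po_file::write`
helpers (the same module the `mdbook-i18n-normalize` binary imports),
and `messages.po` is a placeholder path:

  use std::path::Path;

  use anyhow::Context;
  use mdbook_i18n_helpers::normalize::normalize;
  use polib::po_file;

  fn main() -> anyhow::Result<()> {
      // Parse an existing PO file, normalize the Markdown in its
      // messages, and write the result back in place.
      let catalog = po_file::parse(Path::new("messages.po"))
          .context("could not parse messages.po")?;
      let normalized = normalize(catalog)?;
      po_file::write(&normalized, Path::new("messages.po"))
          .context("could not write messages.po")?;
      Ok(())
  }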