Skip to content

Commit

Permalink
Merge pull request #883 from googlefonts/improve-html-formatter
Browse files Browse the repository at this point in the history
Improve HTML formatter & activate it in new packager
  • Loading branch information
m4rc1e authored Mar 22, 2024
2 parents c03a5e5 + bcebd27 commit 6422c80
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 1 deletion.
8 changes: 7 additions & 1 deletion Lib/gftools/packager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from gftools.scripts.add_font import main as add_font
from gftools.tags import GFTags
from gftools.util import google_fonts as fonts
from gftools.utils import download_file, is_google_fonts_repo
from gftools.utils import download_file, is_google_fonts_repo, format_html

log = logging.getLogger("gftools.packager")
LOG_FORMAT = "%(message)s"
Expand Down Expand Up @@ -307,6 +307,12 @@ def package_family(
os.remove(fp)
shutil.copytree(tmp_dir, family_path, dirs_exist_ok=True)
save_metadata(family_path / "METADATA.pb", metadata)
# Format HTML
desc_file = family_path / "DESCRIPTION.en_us.html"
with open(desc_file, encoding="utf-8") as fin:
description = format_html(fin.read())
with open(desc_file, "w", encoding="utf-8") as fout:
fout.write(description)
return True


Expand Down
27 changes: 27 additions & 0 deletions Lib/gftools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,33 @@ def _html_custom_formatter(string):
string = string.replace(". ", f".\n{whitespace}")
string = string.replace("! ", f"!\n{whitespace}")
string = string.replace("? ", f"?\n{whitespace}")
# Read into list
strings = string.split("\n")
# Cycle through list to find abbreviations
for i in range(1, len(strings)):
this_line = strings[i-1]
next_line = strings[i]
if this_line == "":
continue
if (
re.search(r"i\.?e\.$", this_line) # ie.
or re.search(r"e\.?g\.$", this_line) # eg.
or (
re.search(r"etc[\.|\?|!]$", this_line)
and next_line[1] == next_line[1].lower()
) # etc.
or (
re.search(r"\W\w{1,2}[\.|\?|!]$", this_line)
and this_line[-2] == this_line[-2].upper()
and next_line[1] == next_line[1].lower()
) # H.R. Giger
):
strings[i-1] = strings[i-1] + strings[i]
strings[i] = ""
# Join back together
string = "\n".join(strings)
# Remove double lines
string = string.replace("\n\n", "\n")
return string


Expand Down
40 changes: 40 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,43 @@ def test_remove_url_prefix(url, want):
from gftools.utils import remove_url_prefix
got = remove_url_prefix(url)
assert got == want


def test_format_html():
# Regression test for gftools.utils.format_html: feeds a <p> block of
# sentences through the formatter and compares the result, character for
# character, with a hand-written expected string.
# The import is done inside the test body rather than at module level.
from gftools.utils import format_html

# Fixture: several sentences per line, mixing real sentence ends with the
# abbreviations "e.g.", "eg.", "i.e.", "ie.", "etc." and the initials "H.R.".
# NOTE(review): the local name `input` shadows the builtin of the same name
# for the rest of this function -- harmless here, but consider renaming.
# NOTE(review): leading whitespace inside this literal may have been lost in
# this extract -- confirm against the repository before editing it.
input = """<p>
First sentence. Second sentence.
Sentence that uses an abbreviation, e.g. "for example". Sentence that uses an abbreviation, eg. "for example".
Sentence that uses another abbreviation, i.e. "for example". Sentence that uses another abbreviation, ie. "for example".
Sentence that ends in etc. Another sentence after it.
Sentence that uses etc. but then doesn't end.
The characters of the film were designed by H.R. Giger. His alien characters became iconic throughout pop culture.
The characters of the film were designed by H.R. Giger, a Swiss sculptural artist. His alien characters became iconic throughout pop culture.
He was referred to H.R. Giger, who headed the H.R. department at the time, then told them they're fired. <-- Can't have it both ways. Legitimate abbreviations at the end of sentences can only be caught if they are known in advance, e.g. etc.
</p>
"""

# Expected result, as spelled out below:
#   * each sentence starts on its own line;
#   * "e.g."/"eg."/"i.e."/"ie." never cause a break;
#   * "etc." causes a break only when the following word is capitalised;
#   * "H.R." followed by a capitalised word ("Giger") IS broken, while
#     "H.R. department" stays together -- the limitation the fixture text
#     itself calls out ("Can't have it both ways").
output = """<p>
First sentence.
Second sentence.
Sentence that uses an abbreviation, e.g. "for example".
Sentence that uses an abbreviation, eg. "for example".
Sentence that uses another abbreviation, i.e. "for example".
Sentence that uses another abbreviation, ie. "for example".
Sentence that ends in etc.
Another sentence after it.
Sentence that uses etc. but then doesn't end.
The characters of the film were designed by H.R.
Giger.
His alien characters became iconic throughout pop culture.
The characters of the film were designed by H.R.
Giger, a Swiss sculptural artist.
His alien characters became iconic throughout pop culture.
He was referred to H.R.
Giger, who headed the H.R. department at the time, then told them they're fired.
<-- Can't have it both ways.
Legitimate abbreviations at the end of sentences can only be caught if they are known in advance, e.g. etc.
</p>
"""
# Exact string equality: any whitespace or line-break difference fails the test.
assert format_html(input) == output

0 comments on commit 6422c80

Please sign in to comment.