Skip to content

Commit

Permalink
Rework mimetypes handling (#75)
Browse files Browse the repository at this point in the history
- Fix an issue where native (Python and platform) MIME=>ext conversions were not patched.
- Allow the user config to overwrite the patch.
- Patch mimetypes.init() to defer patching, which improves performance and allows unittest mocks of the user config directory to work with the mimetypes module.
  • Loading branch information
danny0838 committed Jan 31, 2024
1 parent d851b3e commit ba979fc
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 56 deletions.
64 changes: 63 additions & 1 deletion tests/test_mimetypes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import importlib
import os
import tempfile
import unittest
from unittest import mock

from webscrapbook import WSB_USER_DIR
from webscrapbook._polyfill import mimetypes

from . import TEMP_DIR


def setUpModule():
# mock out user config
Expand All @@ -14,15 +19,25 @@ def setUpModule():
for mocking in mockings:
mocking.start()

# Since our mimetypes patch is one-time, we need to reload the modules to
# reapply the patch on the reinited mimetypes database.
# This is also required in Python < 3.7.5, in which no default maps exist and
# `mimetypes.init()` cannot recover the default maps.
importlib.reload(mimetypes._mimetypes)
importlib.reload(mimetypes)


def tearDownModule():
    """Stop the module-level mocks and reload the mimetypes modules.

    The standard `mimetypes` module is reloaded before the polyfill that
    wraps it, so both are re-executed once the mocks are no longer active.
    """
    # stop mock
    for active_mock in mockings:
        active_mock.stop()

    # stdlib module first, then the polyfill built on top of it
    importlib.reload(mimetypes._mimetypes)
    importlib.reload(mimetypes)


class TestMimetypes(unittest.TestCase):
def test_overridden_mimetypes(self):
def test_patch_ext2type(self):
self.assertEqual(
mimetypes.guess_type('myfile.htz'),
('application/html+zip', None),
Expand Down Expand Up @@ -56,6 +71,53 @@ def test_overridden_mimetypes(self):
('image/x-icon', None),
)

def test_patch_type2ext(self):
    """The MIME type => extension lookup yields `.js` for JavaScript."""
    preferred_ext = mimetypes.guess_extension('text/javascript')
    self.assertEqual(preferred_ext, '.js')

def test_user_config(self):
    """Test if user config works."""
    patch_target = 'webscrapbook.Config.user_config_dir'

    def reload_mimetypes():
        # Reload the stdlib module first, then the polyfill that patches it.
        importlib.reload(mimetypes._mimetypes)
        importlib.reload(mimetypes)

    with tempfile.TemporaryDirectory(prefix='mimetypes-', dir=TEMP_DIR) as root:
        user_config_dir = os.path.normpath(os.path.join(root, WSB_USER_DIR))
        os.makedirs(user_config_dir)
        user_file = os.path.join(user_config_dir, mimetypes.WSB_USER_MIMETYPES)
        with open(user_file, 'w', encoding='UTF-8') as fh:
            # poison with bad/invalid conversions that are unlikely really used
            fh.write(
                'user/.type js\n'
                'user/.type2 js\n'
                'text/javascript .userext\n'
                'text/javascript .userext2 .userext3\n'
            )

        try:
            # The lookups below stay inside the `mock.patch` blocks: the
            # patch is deferred until the database is first initialized,
            # which is when the user config directory is actually read.
            with mock.patch(patch_target, return_value=os.devnull):
                reload_mimetypes()

                # get the default conversion
                baseline_exts = mimetypes.guess_all_extensions('text/javascript')

            with mock.patch(patch_target, return_value=user_config_dir):
                reload_mimetypes()

                # last-win (overwrite built-in)
                self.assertEqual(
                    mimetypes.guess_type('abc.js'),
                    ('user/.type2', None),
                )

                # first-win (add to last extensions)
                self.assertEqual(
                    mimetypes.guess_all_extensions('text/javascript'),
                    baseline_exts + ['..userext', '..userext2', '..userext3'],
                )
        finally:
            # leave the modules freshly reloaded for the remaining tests
            reload_mimetypes()


if __name__ == '__main__':
unittest.main()
149 changes: 96 additions & 53 deletions webscrapbook/_polyfill/mimetypes.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,103 @@
import mimetypes as _mimetypes
import os
from mimetypes import *

from .. import Config

__all__ = _mimetypes.__all__
WSB_USER_MIMETYPES = 'mime.types'


def _patch_mimetypes():
    """Patch the standard `mimetypes` database with WebScrapBook mappings.

    The internal mappings overwrite conflicting native/platform mappings in
    both directions (extension => type and type => extensions). The user
    `mime.types` file in the WebScrapBook user config directory is read
    afterwards, so it can in turn override the internal mappings.

    If the `mimetypes` module is already initialized, its live database is
    patched immediately. Otherwise `mimetypes.init` is wrapped so that the
    patch is applied right after the first initialization; the user config
    directory is therefore not looked up until the database is really needed.
    The wrapper applies the patch only once, so a later re-initialization is
    not patched again.
    """
    patch_types_map = {
        # WebScrapBook related
        '.htz': 'application/html+zip',
        '.maff': 'application/x-maff',
        '.wsba': 'application/wsba+zip',

        # Some common types
        '.md': 'text/markdown',
        '.mkd': 'text/markdown',
        '.mkdn': 'text/markdown',
        '.mdwn': 'text/markdown',
        '.mdown': 'text/markdown',
        '.markdown': 'text/markdown',
        '.rss': 'application/rss+xml',
        '.atom': 'application/atom+xml',
        '.woff': 'font/woff',
        '.woff2': 'font/woff2',
        '.webp': 'image/webp',
        '.weba': 'audio/weba',
        '.webm': 'video/webm',
        '.oga': 'audio/ogg',
        '.ogv': 'video/ogg',
        '.ogx': 'application/ogg',  # IANA
        '.ogg': 'application/ogg',  # MAFF
        '.vtt': 'text/vtt',
        '.swf': 'application/x-shockwave-flash',  # Apache, nginx, etc.
        '.jar': 'application/java-archive',
        '.class': 'application/java-vm',
        '.epub': 'application/epub+zip',
        '.7z': 'application/x-7z-compressed',
        '.rar': 'application/vnd.rar',

        # .js is mapped to application/javascript or application/x-javascript in some OS
        # ref: https://www.ietf.org/rfc/rfc9239.txt
        # text/javascript is mapped to .es in Debian 12
        '.js': 'text/javascript',

        # .bmp is mapped to image/x-ms-bmp in Python < 3.11
        # ref: https://github.com/python/cpython/issues/86194
        '.bmp': 'image/bmp',

        # .ico is mapped to image/vnd.microsoft.icon in Python,
        # which is not actually used by Microsoft software and causes
        # a compatibility issue in IE9.
        # ref: https://en.wikipedia.org/wiki/ICO_%28file_format%29#MIME_type
        '.ico': 'image/x-icon',

        # .zip is mapped to application/x-zip-compressed in Windows
        '.zip': 'application/zip',
    }

    def patch_db(db):
        """Apply the internal mappings, then the user mappings, to `db`."""
        # extension => type: simply overwrite whatever is there
        exts_by_type = {}
        for ext, mimetype in patch_types_map.items():
            db.types_map[True][ext] = mimetype
            exts_by_type.setdefault(mimetype, []).append(ext)

        # type => extensions: the first extension wins, so move the patched
        # extensions to the front of the list (mutated in place, since the
        # list object is owned by the database)
        for mimetype, exts in exts_by_type.items():
            known_exts = db.types_map_inv[True].setdefault(mimetype, [])
            for ext in exts:
                if ext in known_exts:
                    known_exts.remove(ext)
            known_exts[0:0] = exts

        # load user mappings last so that they can override the patch
        user_file = os.path.join(Config.user_config_dir(), WSB_USER_MIMETYPES)
        if os.path.isfile(user_file):
            db.read(user_file)

    if _mimetypes.inited:
        patch_db(_mimetypes._db)
        return

    # Not initialized yet: wrap init() so that the patch is deferred until
    # the database is actually built.
    patched = False
    _init = _mimetypes.init

    def init(files=None):
        """Run the original `mimetypes.init` and patch the new database once."""
        nonlocal patched
        _init(files)
        if not patched:
            patch_db(_mimetypes._db)
            patched = True

    _mimetypes.init = init


# Run the patch when this module is imported (or reloaded).
_patch_mimetypes()

# export all attributes
# NOTE(review): the star import comes after the patch call; presumably this is
# so that the re-exported `init` is the wrapped one when `_patch_mimetypes()`
# replaced `mimetypes.init` -- confirm before reordering.
from mimetypes import * # noqa: E402

# Expose the same public names as the standard `mimetypes` module.
__all__ = _mimetypes.__all__
8 changes: 6 additions & 2 deletions webscrapbook/resources/mimetypes.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@ MIME type mappings for WebScrapBook are defined by:

1. the default mappings of the native Python code
2. the system-wide registry
3. the user config file for WebScrapBook
4. the internal mappings of WebScrapBook
3. the internal mappings of WebScrapBook
4. the user config file for WebScrapBook

For conflicting definitions, a conversion of file extension to MIME type is
handled in a last-win manner, while a conversion of MIME type to file
extension(s) is handled in a first-win manner.

As an exception, the internal mappings of WebScrapBook overwrite any
conflicting mappings from the preceding sources, to fix known mapping issues
in native Python and platforms.


## System registry

Expand Down

0 comments on commit ba979fc

Please sign in to comment.