Skip to content

Commit

Permalink
iterparse: ignore "strip_cdata" when parsing HTML (GH-450)
Browse files Browse the repository at this point in the history
Commit b79424c
deprecated the strip_cdata argument to the HTML parser, causing all uses of iterparse()
to trigger its DeprecationWarning (due to the default True value).

Remove the strip_cdata argument from the HTML parser's arguments,
and document it as ignored in iterparse() except for XML documents.

See https://bugs.launchpad.net/lxml/+bug/2067707
  • Loading branch information
ferdnyc authored and scoder committed Feb 3, 2025
1 parent e73c466 commit 306041e
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/lxml/iterparse.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ cdef class iterparse:
- remove_blank_text: discard blank text nodes
- remove_comments: discard comments
- remove_pis: discard processing instructions
- strip_cdata: replace CDATA sections by normal text content (default: True)
- strip_cdata: replace CDATA sections by normal text content (default:
True for XML, ignored otherwise)
- compact: save memory for short text content (default: True)
- resolve_entities: replace entities by their text value (default: True)
- huge_tree: disable security restrictions and support very deep trees
Expand Down Expand Up @@ -97,7 +98,6 @@ cdef class iterparse:
remove_blank_text=remove_blank_text,
remove_comments=remove_comments,
remove_pis=remove_pis,
strip_cdata=strip_cdata,
no_network=no_network,
target=None, # TODO
schema=schema,
Expand Down
40 changes: 40 additions & 0 deletions src/lxml/tests/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,46 @@ def test_html_iterparse_start(self):
('start', root[1]), ('start', root[1][0])],
events)

def test_html_iterparse_cdata(self):
    """iterparse(html=True) must not raise warnings over strip_cdata.

    The same HTML input, which contains a CDATA section, is handed to
    iterparse() three times: without strip_cdata, with strip_cdata=True
    and with strip_cdata=False.  Each construction runs while all
    warnings are being recorded, and nothing may have been recorded.
    Every resulting iterator must still report the root element as its
    first 'start' event.
    """
    import warnings

    make_iterparse = self.etree.iterparse
    source = BytesIO(b'<html><body><![CDATA[ foo ]]></body></html>')

    # Order matters only for readability of failures: argument omitted,
    # then explicitly True, then explicitly False.
    for extra_kwargs in ({}, {'strip_cdata': True}, {'strip_cdata': False}):
        # Rewind so that every pass reads the document from the start.
        source.seek(0)

        with warnings.catch_warnings(record=True) as recorded:
            # "always" disables the once-per-location filtering, so a
            # warning raised on a later pass cannot be suppressed.
            warnings.simplefilter("always")
            parse_iter = make_iterparse(
                source, html=True, events=('start', ), **extra_kwargs)
        self.assertFalse(recorded)

        seen_events = list(parse_iter)
        tree_root = parse_iter.root
        self.assertNotEqual(None, tree_root)
        self.assertEqual(('start', tree_root), seen_events[0])

def test_html_feed_parser(self):
parser = self.etree.HTMLParser()
parser.feed("<html><body></")
Expand Down

0 comments on commit 306041e

Please sign in to comment.