diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 4f79b106..e8e6e503 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -30,10 +30,11 @@ def extract_items(self, document, base_url=None): def _extract_items(self, node): script = node.xpath('string()') try: - data = json.loads(script) + # TODO: `strict=False` can be configurable if needed + data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script)) + data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) if isinstance(data, list): return data elif isinstance(data, dict): diff --git a/tests/samples/custom.invalid/JSONLD_with_control_characters.html b/tests/samples/custom.invalid/JSONLD_with_control_characters.html new file mode 100644 index 00000000..97c65cfd --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_control_characters.html @@ -0,0 +1,17 @@ + + + +
+ + + + + + \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_with_control_characters.jsonld b/tests/samples/custom.invalid/JSONLD_with_control_characters.jsonld new file mode 100644 index 00000000..dab34a80 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_control_characters.jsonld @@ -0,0 +1,5 @@ +[ + { + "data": "line 1\n line 2\n line 3\n" + } +] \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.html b/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.html new file mode 100644 index 00000000..6949cbc1 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.html @@ -0,0 +1,15 @@ + + + + + + + + + + diff --git a/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.jsonld b/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.jsonld new file mode 100644 index 00000000..b87aaa8d --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_control_characters_comment.jsonld @@ -0,0 +1,5 @@ +[ + { + "data": "some\n text" + } +] diff --git a/tests/samples/misc/microformat_flat_test.json b/tests/samples/misc/microformat_flat_test.json index 4c952b2b..28344b3d 100644 --- a/tests/samples/misc/microformat_flat_test.json +++ b/tests/samples/misc/microformat_flat_test.json @@ -1,8 +1,8 @@ [ { "@type": [ - "h-hidden-tablet", - "h-hidden-phone" + "h-hidden-phone", + "h-hidden-tablet" ], "name": [ "" @@ -17,8 +17,8 @@ "children": [ { "@type": [ - "h-hidden-tablet", - "h-hidden-phone" + "h-hidden-phone", + "h-hidden-tablet" ], "name": [ "" @@ -72,4 +72,4 @@ "2013-06-13 12:00:00" ] } -] \ No newline at end of file +] diff --git a/tests/samples/misc/microformat_test.json b/tests/samples/misc/microformat_test.json index 24fccf25..a485db03 100644 --- a/tests/samples/misc/microformat_test.json +++ b/tests/samples/misc/microformat_test.json @@ -6,8 +6,8 @@ ] }, "type": [ - "h-hidden-tablet", - "h-hidden-phone" + "h-hidden-phone", + "h-hidden-tablet" ] }, { @@ -20,8 +20,8 @@ ] }, "type": [ - "h-hidden-tablet", - "h-hidden-phone" + "h-hidden-phone", + "h-hidden-tablet" ] }, { @@ -80,4 +80,4 @@ "h-entry" ] } -] \ No newline at end of file +] diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 003b7502..de717d02 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -5,6 +5,7 @@ from extruct.jsonld import JsonLdExtractor from tests import get_testdata + class TestJsonLD(unittest.TestCase): def test_schemaorg_CreativeWork(self): @@ -42,3 +43,21 @@ def test_jsonld_with_comments(self): jsonlde = JsonLdExtractor() data = jsonlde.extract(body) self.assertEqual(data, expected) + + def test_jsonld_with_control_characters(self): + page = 'JSONLD_with_control_characters' + body = get_testdata('custom.invalid', '{}.html'.format(page)) + expected = json.loads(get_testdata('custom.invalid', '{}.jsonld'.format(page)).decode('UTF-8')) + + jsonlde = JsonLdExtractor() + data = jsonlde.extract(body) + self.assertEqual(data, expected) + + def test_jsonld_with_control_characters_comment(self): + page = 'JSONLD_with_control_characters_comment' + body = get_testdata('custom.invalid', '{}.html'.format(page)) + expected = json.loads(get_testdata('custom.invalid', '{}.jsonld'.format(page)).decode('UTF-8')) + + jsonlde = JsonLdExtractor() + data = jsonlde.extract(body) + self.assertEqual(data, expected) diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 8ab3cb5c..db178f51 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -29,12 +29,12 @@ def test_uopengraph(self): def test_umicroformat(self): expected = [ { '@context': 'http://microformats.org/wiki/', - '@type': ['h-hidden-tablet', 'h-hidden-phone'], + '@type': ['h-hidden-phone', 'h-hidden-tablet'], 'name': ['']}, { '@context': 'http://microformats.org/wiki/', '@type': ['h-hidden-phone'], - 'children': [ { '@type': [ 'h-hidden-tablet', - 'h-hidden-phone'], + 'children': [ { '@type': [ 'h-hidden-phone', + 'h-hidden-tablet'], 'name': ['']}, { '@type': ['h-hidden-phone'], 'name': [ 'aJ Styles FastLane 2018 15 x '