From 6efcb6c019e7dd80306c54b6165c1c048186401a Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Fri, 15 Jul 2022 16:01:18 -0700 Subject: [PATCH] Html Reader Not Handling non-ASCII Data Correctly Fix #2942. Code was changed by #2894 because PHP8.2 will deprecate how it was being done. See linked issue for more details. Dom loadhtml assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption (replace ampersand, less than, and greater than with entities representing illegal characters, then use htmlentities, then restore ampersand, less than, and greater than). --- src/PhpSpreadsheet/Reader/Html.php | 34 ++++++++++++++++++- .../Reader/Html/Issue2942Test.php | 28 +++++++++++++++ tests/data/Reader/HTML/utf8chars.html | 17 ++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php create mode 100644 tests/data/Reader/HTML/utf8chars.html diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php index 3d859e15f6..5805a9d274 100644 --- a/src/PhpSpreadsheet/Reader/Html.php +++ b/src/PhpSpreadsheet/Reader/Html.php @@ -201,7 +201,7 @@ private static function containsTags(string $data): bool /** * Loads Spreadsheet from file. */ - protected function loadSpreadsheetFromFile(string $filename): Spreadsheet + public function loadSpreadsheetFromFile(string $filename): Spreadsheet { // Create new Spreadsheet $spreadsheet = new Spreadsheet(); @@ -651,6 +651,22 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet) // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scanFile($filename); + /*if (substr($convert, 0, 6) !== '' . $convert; + }*/ + // Surrogate characters should not be valid in html. 
+ // Ampersand must be replaced before < and >. + $convert = str_replace( + ['&', '<', '>'], + ['&#xd800;', '&#xd801;', '&#xd802;'], + $convert + ); + $convert = htmlentities($convert, ENT_NOQUOTES, 'UTF-8'); + $convert = str_replace( + ['&amp;#xd800;', '&amp;#xd801;', '&amp;#xd802;'], + ['&', '<', '>'], + $convert + ); $loaded = $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; @@ -674,6 +690,22 @@ public function loadFromString($content, ?Spreadsheet $spreadsheet = null): Spre // Reload the HTML file into the DOM object try { $convert = $this->securityScanner->scan($content); + /*if (substr($convert, 0, 6) !== '' . $convert; + }*/ + // Surrogate characters should not be valid in html. + // Ampersand must be replaced before < and >. + $convert = str_replace( + ['&', '<', '>'], + ['&#xd800;', '&#xd801;', '&#xd802;'], + $convert + ); + $convert = htmlentities($convert, ENT_NOQUOTES, 'UTF-8'); + $convert = str_replace( + ['&amp;#xd800;', '&amp;#xd801;', '&amp;#xd802;'], + ['&', '<', '>'], + $convert + ); $loaded = $dom->loadHTML($convert); } catch (Throwable $e) { $loaded = false; diff --git a/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php new file mode 100644 index 0000000000..253be2336f --- /dev/null +++ b/tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php @@ -0,0 +1,28 @@ +éàâèî'; + $reader = new Html(); + $spreadsheet = $reader->loadFromString($content); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + } + + public function testLoadFromFile(): void + { + $file = 'tests/data/Reader/HTML/utf8chars.html'; + $reader = new Html(); + $spreadsheet = $reader->loadSpreadsheetFromFile($file); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame('éàâèî', $sheet->getCell('A1')->getValue()); + self::assertSame('αβγδε', $sheet->getCell('B1')->getValue()); + } +} diff --git a/tests/data/Reader/HTML/utf8chars.html b/tests/data/Reader/HTML/utf8chars.html new file mode 100644 index 
0000000000..cd21c4a2d6 --- /dev/null +++ b/tests/data/Reader/HTML/utf8chars.html @@ -0,0 +1,17 @@ + + + + +Test Utf-8 characters + + + + + + + + + +
éàâèîαβγδε
+ +