Skip to content

Commit

Permalink
Html Reader Not Handling non-ASCII Data Correctly
Browse files Browse the repository at this point in the history
Fix PHPOffice#2942. The code was changed by PHPOffice#2894 because PHP 8.2 will deprecate the way it was being done. See the linked issue for more details. DOMDocument::loadHTML assumes ISO-8859-1 in the absence of a charset attribute or equivalent, and there is no way to override that assumption. Sigh. The suggested replacements are unsuitable in one way or another. I think this will work with minimal disruption: replace ampersand, less-than, and greater-than with numeric entities for surrogate code points (which are not legal characters in HTML), then apply htmlentities, then restore ampersand, less-than, and greater-than.
  • Loading branch information
oleibman committed Jul 15, 2022
1 parent 99ce5c2 commit 6efcb6c
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 1 deletion.
34 changes: 33 additions & 1 deletion src/PhpSpreadsheet/Reader/Html.php
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ private static function containsTags(string $data): bool
/**
* Loads Spreadsheet from file.
*/
protected function loadSpreadsheetFromFile(string $filename): Spreadsheet
public function loadSpreadsheetFromFile(string $filename): Spreadsheet
{
// Create new Spreadsheet
$spreadsheet = new Spreadsheet();
Expand Down Expand Up @@ -651,6 +651,22 @@ public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
// Reload the HTML file into the DOM object
try {
$convert = $this->securityScanner->scanFile($filename);
/*if (substr($convert, 0, 6) !== '<?xml ') {
$convert = '<?xml encoding = "UTF-8">' . $convert;
}*/
// Surrogate characters should not be valid in html.
// Ampersand must be replaced before < and >.
$convert = str_replace(
['&', '<', '>'],
['&#xd800;', '&#xd801;', '&#xd802;'],
$convert
);
$convert = htmlentities($convert, ENT_NOQUOTES, 'UTF-8');
$convert = str_replace(
['&amp;#xd800;', '&amp;#xd801;', '&amp;#xd802;'],
['&', '<', '>'],
$convert
);
$loaded = $dom->loadHTML($convert);
} catch (Throwable $e) {
$loaded = false;
Expand All @@ -674,6 +690,22 @@ public function loadFromString($content, ?Spreadsheet $spreadsheet = null): Spre
// Reload the HTML file into the DOM object
try {
$convert = $this->securityScanner->scan($content);
/*if (substr($convert, 0, 6) !== '<?xml ') {
$convert = '<?xml encoding = "UTF-8">' . $convert;
}*/
// Surrogate characters should not be valid in html.
// Ampersand must be replaced before < and >.
$convert = str_replace(
['&', '<', '>'],
['&#xd800;', '&#xd801;', '&#xd802;'],
$convert
);
$convert = htmlentities($convert, ENT_NOQUOTES, 'UTF-8');
$convert = str_replace(
['&amp;#xd800;', '&amp;#xd801;', '&amp;#xd802;'],
['&', '<', '>'],
$convert
);
$loaded = $dom->loadHTML($convert);
} catch (Throwable $e) {
$loaded = false;
Expand Down
28 changes: 28 additions & 0 deletions tests/PhpSpreadsheetTests/Reader/Html/Issue2942Test.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;

use PhpOffice\PhpSpreadsheet\Reader\Html;
use PHPUnit\Framework\TestCase;

/**
 * Regression tests for issue 2942: the Html reader must return non-ASCII
 * UTF-8 text unchanged, both when reading from a string and from a file.
 */
class Issue2942Test extends TestCase
{
    /**
     * Accented characters in an HTML string end up verbatim in cell A1.
     */
    public function testLoadFromString(): void
    {
        $html = '<table><tbody><tr><td>éàâèî</td></tr></tbody></table>';
        $htmlReader = new Html();
        $workbook = $htmlReader->loadFromString($html);
        $worksheet = $workbook->getActiveSheet();

        self::assertSame('éàâèî', $worksheet->getCell('A1')->getValue());
    }

    /**
     * Accented and Greek characters in the fixture file end up verbatim
     * in cells A1 and B1 respectively.
     */
    public function testLoadFromFile(): void
    {
        $htmlReader = new Html();
        $workbook = $htmlReader->loadSpreadsheetFromFile('tests/data/Reader/HTML/utf8chars.html');
        $worksheet = $workbook->getActiveSheet();

        // Checked in the same order as before: A1 first, then B1.
        $expectedByCell = [
            'A1' => 'éàâèî',
            'B1' => 'αβγδε',
        ];
        foreach ($expectedByCell as $coordinate => $expectedText) {
            self::assertSame($expectedText, $worksheet->getCell($coordinate)->getValue());
        }
    }
}
17 changes: 17 additions & 0 deletions tests/data/Reader/HTML/utf8chars.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<head>
<!-- deliberately do not identify charset for this test -->
<title>Test Utf-8 characters</title>
</head>
<body>
<table>
<tbody>
<tr>
<td>éàâèî</td>
<td>αβγδε</td>
</tr>
</tbody>
</table>
</body>
</html>

0 comments on commit 6efcb6c

Please sign in to comment.