From 6b59885d7fa36ed59d439a459e7a3d73f52f8b41 Mon Sep 17 00:00:00 2001
From: mferrera
Date: Wed, 31 May 2023 11:18:56 +0200
Subject: [PATCH] Fail more helpfully on unsupported encodings in pack_sim

Read the data file into memory up front so that a UnicodeDecodeError can be
re-raised with the offending character, file name and line number.

The outer keyword scan and the inner PATHS-record scan share a single
iterator over the lines that were read, so the PATHS records are consumed
starting at the line after the keyword (as they were when both loops
iterated the open file handle).
---
 src/subscript/pack_sim/pack_sim.py | 78 ++++++++++++++++++++----------
 tests/test_pack_sim.py             | 29 ++++++++++++
 2 files changed, 81 insertions(+), 26 deletions(-)

diff --git a/src/subscript/pack_sim/pack_sim.py b/src/subscript/pack_sim/pack_sim.py
index a0bd2f448..fb7ecc547 100755
--- a/src/subscript/pack_sim/pack_sim.py
+++ b/src/subscript/pack_sim/pack_sim.py
@@ -7,7 +7,7 @@
 from io import StringIO
 from pathlib import Path
 from shutil import copy
-from typing import Dict, Optional, TextIO, Union
+from typing import Dict, List, Optional, TextIO, Union
 
 from subscript import __version__, getLogger
 
@@ -139,33 +139,59 @@ def _get_paths(filename: Path, org_sim_loc: Path) -> Dict[str, Path]:
     # Check if the filename can be found
     filename = _expand_filename(filename, org_sim_loc)
 
-    with open(filename, "r", encoding="utf8") as fhandle:
-        # Read through all lines of text
-        for line in fhandle:
-            line_strip = line.strip()
-
-            if line_strip.startswith("PATHS"):
-                logger.info("Found Eclipse PATHS keyword, creating a dictionary.")
-
-                # In the keyword, find the path definitions and ignore comments
-                for innerline in fhandle:
-                    line_strip = innerline.strip()
-                    if line_strip.startswith("--"):
-                        continue
+    try:
+        with open(filename, encoding="utf-8") as fin:
+            lines = fin.readlines()
+    except UnicodeDecodeError as e:
+        error_words = str(e).split(" ")
+        hex_str = error_words[error_words.index("byte") + 1]
+        try:
+            bad_char = chr(int(hex_str, 16))
+        except ValueError:
+            bad_char = f"hex:{hex_str}"
+        with open(filename, "rb") as fin:
+            byte_lines: List[bytes] = fin.readlines()
+
+        for i, byte_line in enumerate(byte_lines):
+            try:
+                byte_line.decode("utf-8")
+            except UnicodeDecodeError:
+                bad_line_num = i + 1
+                e.reason = (
+                    f"Unsupported non-UTF-8 character {bad_char!r} "
+                    f"found in file: {filename.name} on line {bad_line_num}"
+                )
+                break
+        raise e
 
-                    if innerline.split("--")[0].strip() == "/":
-                        # Finished reading the data for the PATHS keyword
-                        break
+    # Read through all lines of text. Both loops draw from one shared iterator
+    # so the PATHS records are read from the line after the keyword.
+    line_iter = iter(lines)
+    for line in line_iter:
+        line_strip = line.strip()
 
-                    # Assume we have found a PATHS definition line
-                    try:
-                        path_info = innerline.split("--")[0].strip().split("'")
-                        paths[path_info[1]] = Path(path_info[3])
-                    except IndexError:
-                        logger.warning(
-                            "Could not parse %s as a PATHS definition, skipping",
-                            line_strip,
-                        )
+        if line_strip.startswith("PATHS"):
+            logger.info("Found Eclipse PATHS keyword, creating a dictionary.")
+
+            # In the keyword, find the path definitions and ignore comments
+            for innerline in line_iter:
+                line_strip = innerline.strip()
+                if line_strip.startswith("--"):
+                    continue
+
+                if innerline.split("--")[0].strip() == "/":
+                    # Finished reading the data for the PATHS keyword
+                    break
+
+                # Assume we have found a PATHS definition line
+                try:
+                    path_info = innerline.split("--")[0].strip().split("'")
+                    paths[path_info[1]] = Path(path_info[3])
+                except IndexError:
+                    logger.warning(
+                        "Could not parse %s as a PATHS definition, skipping",
+                        line_strip,
+                    )
 
     logger.debug("Dictionary created: %s", str(paths))
     return paths
diff --git a/tests/test_pack_sim.py b/tests/test_pack_sim.py
index 77c510fdd..56a6b06b8 100644
--- a/tests/test_pack_sim.py
+++ b/tests/test_pack_sim.py
@@ -53,6 +53,35 @@ def test_main_fmu(tmp_path, mocker):
     assert Path("include/props/reek.pvt").exists()
 
 
+def test_helpful_latin1_encoding_exception(tmp_path, mocker):
+    """Test that a more helpful error message is given when a file with an
+    unsupported encoding is given"""
+    tmp_data_file = tmp_path / "TMP.DATA"
+    with open(tmp_data_file, "w", encoding="iso-8859-1") as fout:
+        fout.write("-- død")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file), "."])
+    with pytest.raises(
+        UnicodeDecodeError, match=(f"'ø' found in file: {tmp_data_file.name} on line 1")
+    ):
+        pack_sim.main()
+
+    tmp_data_file2 = tmp_path / "TMP2.DATA"
+    with open(tmp_data_file2, "w", encoding="iso-8859-1") as fout:
+        fout.write("-- A\nRUNSPEC\n-- på sjøen")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file2), "."])
+    with pytest.raises(
+        UnicodeDecodeError,
+        match=(f"'å' found in file: {tmp_data_file2.name} on line 3"),
+    ):
+        pack_sim.main()
+
+    tmp_data_file3 = tmp_path / "TMP3.DATA"
+    with open(tmp_data_file3, "w", encoding="utf-8") as fout:
+        fout.write(f"INCLUDE\n '{tmp_data_file.name}' /")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file3), "."])
+    pack_sim.main()
+
+
 def test_repeated_run(tmp_path, mocker):
     """Test what happens on repeated incovations"""
     os.chdir(tmp_path)