Skip to content

Commit

Permalink
Fail more helpfully on unsupported encodings in pack_sim
Browse files Browse the repository at this point in the history
  • Loading branch information
mferrera committed May 31, 2023
1 parent 653515f commit c9818c7
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 26 deletions.
79 changes: 53 additions & 26 deletions src/subscript/pack_sim/pack_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from io import StringIO
from pathlib import Path
from shutil import copy
from typing import Dict, Optional, TextIO, Union
from typing import Dict, List, Optional, TextIO, Union

from subscript import __version__, getLogger

Expand Down Expand Up @@ -101,7 +101,10 @@ def _md5checksum(
"""

logger.info("in md5!")

def _md5_on_fhandle(fhandle: TextIO) -> str:
logger.info(f"in md5 fhandle: {fhandle}")
md5hash = hashlib.md5()
wholefile = str(fhandle.read())
md5hash.update("".join(wholefile.splitlines()).encode("utf-8"))
Expand Down Expand Up @@ -139,33 +142,57 @@ def _get_paths(filename: Path, org_sim_loc: Path) -> Dict[str, Path]:
# Check if the filename can be found
filename = _expand_filename(filename, org_sim_loc)

with open(filename, "r", encoding="utf8") as fhandle:
# Read through all lines of text
for line in fhandle:
line_strip = line.strip()

if line_strip.startswith("PATHS"):
logger.info("Found Eclipse PATHS keyword, creating a dictionary.")

# In the keyword, find the path definitions and ignore comments
for innerline in fhandle:
line_strip = innerline.strip()
if line_strip.startswith("--"):
continue
try:
with open(filename, encoding="utf-8") as fin:
lines = fin.readlines()
except UnicodeDecodeError as e:
error_words = str(e).split(" ")
hex_str = error_words[error_words.index("byte") + 1]
try:
bad_char = chr(int(hex_str, 16))
except ValueError:
bad_char = f"hex:{hex_str}"
with open(filename, "rb") as fin:
byte_lines: List[bytes] = fin.readlines()

for i, byte_line in enumerate(byte_lines):
try:
byte_line.decode("utf-8")
except UnicodeDecodeError:
bad_line_num = i + 1
e.reason = (
f"Unsupported non-UTF-8 character {bad_char!r} "
f"found in file: {filename.name} on line {bad_line_num}"
)
break
raise e

if innerline.split("--")[0].strip() == "/":
# Finished reading the data for the PATHS keyword
break
# Read through all lines of text
for line in lines:
line_strip = line.strip()

# Assume we have found a PATHS definition line
try:
path_info = innerline.split("--")[0].strip().split("'")
paths[path_info[1]] = Path(path_info[3])
except IndexError:
logger.warning(
"Could not parse %s as a PATHS definition, skipping",
line_strip,
)
if line_strip.startswith("PATHS"):
logger.info("Found Eclipse PATHS keyword, creating a dictionary.")

# In the keyword, find the path definitions and ignore comments
for innerline in lines:
line_strip = innerline.strip()
if line_strip.startswith("--"):
continue

if innerline.split("--")[0].strip() == "/":
# Finished reading the data for the PATHS keyword
break

# Assume we have found a PATHS definition line
try:
path_info = innerline.split("--")[0].strip().split("'")
paths[path_info[1]] = Path(path_info[3])
except IndexError:
logger.warning(
"Could not parse %s as a PATHS definition, skipping",
line_strip,
)
logger.debug("Dictionary created: %s", str(paths))
return paths

Expand Down
29 changes: 29 additions & 0 deletions tests/test_pack_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,35 @@ def test_main_fmu(tmp_path, mocker):
assert Path("include/props/reek.pvt").exists()


def test_helpful_latin1_encoding_exception(tmp_path, mocker):
    """Check the error raised by pack_sim for files that are not UTF-8.

    Two ISO-8859-1 encoded files are passed to ``pack_sim.main`` via a
    patched ``sys.argv``. Each run must raise ``UnicodeDecodeError`` with a
    message matching the offending character, the file name and the line
    number. Finally, a UTF-8 file that references the first latin-1 file by
    name in an INCLUDE statement is passed to ``pack_sim.main`` outside any
    ``pytest.raises`` block, so the test fails if that call raises.
    """
    # Case 1: the non-UTF-8 character sits on the first (and only) line.
    latin1_single = tmp_path / "TMP.DATA"
    latin1_single.write_text("-- død", encoding="iso-8859-1")
    mocker.patch("sys.argv", ["pack_sim", str(latin1_single), "."])
    first_pattern = f"'ø' found in file: {latin1_single.name} on line 1"
    with pytest.raises(UnicodeDecodeError, match=first_pattern):
        pack_sim.main()

    # Case 2: the first non-UTF-8 character appears on the third line.
    latin1_multi = tmp_path / "TMP2.DATA"
    latin1_multi.write_text("-- A\nRUNSPEC\n-- på sjøen", encoding="iso-8859-1")
    mocker.patch("sys.argv", ["pack_sim", str(latin1_multi), "."])
    second_pattern = f"'å' found in file: {latin1_multi.name} on line 3"
    with pytest.raises(UnicodeDecodeError, match=second_pattern):
        pack_sim.main()

    # Case 3: a UTF-8 main file whose INCLUDE names the latin-1 file from
    # case 1; no exception is expected from this invocation.
    utf8_main = tmp_path / "TMP3.DATA"
    utf8_main.write_text(f"INCLUDE\n '{latin1_single.name}' /", encoding="utf-8")
    mocker.patch("sys.argv", ["pack_sim", str(utf8_main), "."])
    pack_sim.main()


def test_repeated_run(tmp_path, mocker):
"""Test what happens on repeated invocations"""
os.chdir(tmp_path)
Expand Down

0 comments on commit c9818c7

Please sign in to comment.