Fail more helpfully on unsupported encodings in pack_sim

equinor · May 31, 2023 · c9818c7 · c9818c7
1 parent 653515f
commit c9818c7
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 26 deletions.
diff --git a/src/subscript/pack_sim/pack_sim.py b/src/subscript/pack_sim/pack_sim.py
@@ -7,7 +7,7 @@
 from io import StringIO
 from pathlib import Path
 from shutil import copy
-from typing import Dict, Optional, TextIO, Union
+from typing import Dict, List, Optional, TextIO, Union
 
 from subscript import __version__, getLogger
 
@@ -101,7 +101,10 @@ def _md5checksum(
 
     """
 
+    logger.info("in md5!")
+
     def _md5_on_fhandle(fhandle: TextIO) -> str:
+        logger.info(f"in md5 fhandle: {fhandle}")
         md5hash = hashlib.md5()
         wholefile = str(fhandle.read())
         md5hash.update("".join(wholefile.splitlines()).encode("utf-8"))
@@ -139,33 +142,57 @@ def _get_paths(filename: Path, org_sim_loc: Path) -> Dict[str, Path]:
     # Check if the filename can be found
     filename = _expand_filename(filename, org_sim_loc)
 
-    with open(filename, "r", encoding="utf8") as fhandle:
-        # Read through all lines of text
-        for line in fhandle:
-            line_strip = line.strip()
-
-            if line_strip.startswith("PATHS"):
-                logger.info("Found Eclipse PATHS keyword, creating a dictionary.")
-
-                # In the keyword, find the path definitions and ignore comments
-                for innerline in fhandle:
-                    line_strip = innerline.strip()
-                    if line_strip.startswith("--"):
-                        continue
+    try:
+        with open(filename, encoding="utf-8") as fin:
+            lines = fin.readlines()
+    except UnicodeDecodeError as e:
+        error_words = str(e).split(" ")
+        hex_str = error_words[error_words.index("byte") + 1]
+        try:
+            bad_char = chr(int(hex_str, 16))
+        except ValueError:
+            bad_char = f"hex:{hex_str}"
+        with open(filename, "rb") as fin:
+            byte_lines: List[bytes] = fin.readlines()
+
+        for i, byte_line in enumerate(byte_lines):
+            try:
+                byte_line.decode("utf-8")
+            except UnicodeDecodeError:
+                bad_line_num = i + 1
+                e.reason = (
+                    f"Unsupported non-UTF-8 character {bad_char!r} "
+                    f"found in file: {filename.name} on line {bad_line_num}"
+                )
+                break
+        raise e
 
-                    if innerline.split("--")[0].strip() == "/":
-                        # Finished reading the data for the PATHS keyword
-                        break
+    # Read through all lines of text
+    for line in lines:
+        line_strip = line.strip()
 
-                    # Assume we have found a PATHS definition line
-                    try:
-                        path_info = innerline.split("--")[0].strip().split("'")
-                        paths[path_info[1]] = Path(path_info[3])
-                    except IndexError:
-                        logger.warning(
-                            "Could not parse %s as a PATHS definition, skipping",
-                            line_strip,
-                        )
+        if line_strip.startswith("PATHS"):
+            logger.info("Found Eclipse PATHS keyword, creating a dictionary.")
+
+            # In the keyword, find the path definitions and ignore comments
+            for innerline in lines:
+                line_strip = innerline.strip()
+                if line_strip.startswith("--"):
+                    continue
+
+                if innerline.split("--")[0].strip() == "/":
+                    # Finished reading the data for the PATHS keyword
+                    break
+
+                # Assume we have found a PATHS definition line
+                try:
+                    path_info = innerline.split("--")[0].strip().split("'")
+                    paths[path_info[1]] = Path(path_info[3])
+                except IndexError:
+                    logger.warning(
+                        "Could not parse %s as a PATHS definition, skipping",
+                        line_strip,
+                    )
     logger.debug("Dictionary created: %s", str(paths))
     return paths
 

diff --git a/tests/test_pack_sim.py b/tests/test_pack_sim.py
@@ -53,6 +53,35 @@ def test_main_fmu(tmp_path, mocker):
     assert Path("include/props/reek.pvt").exists()
 
 
+def test_helpful_latin1_encoding_exception(tmp_path, mocker):
+    """Test that a UnicodeDecodeError naming the offending character, file and
+    line number is raised when an input file is not UTF-8 encoded"""
+    tmp_data_file = tmp_path / "TMP.DATA"
+    with open(tmp_data_file, "w", encoding="iso-8859-1") as fout:
+        fout.write("-- død")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file), "."])
+    with pytest.raises(
+        UnicodeDecodeError, match=(f"'ø' found in file: {tmp_data_file.name} on line 1")
+    ):
+        pack_sim.main()
+
+    tmp_data_file2 = tmp_path / "TMP2.DATA"
+    with open(tmp_data_file2, "w", encoding="iso-8859-1") as fout:
+        fout.write("-- A\nRUNSPEC\n-- på sjøen")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file2), "."])
+    with pytest.raises(
+        UnicodeDecodeError,
+        match=(f"'å' found in file: {tmp_data_file2.name} on line 3"),
+    ):
+        pack_sim.main()
+
+    tmp_data_file3 = tmp_path / "TMP3.DATA"
+    with open(tmp_data_file3, "w", encoding="utf-8") as fout:
+        fout.write(f"INCLUDE\n  '{tmp_data_file.name}' /")
+    mocker.patch("sys.argv", ["pack_sim", str(tmp_data_file3), "."])
+    pack_sim.main()
+
+
 def test_repeated_run(tmp_path, mocker):
     """Test what happens on repeated invocations"""
     os.chdir(tmp_path)