From 483ee449577a70612899ad233bffc5130385af8d Mon Sep 17 00:00:00 2001 From: "Geoffrey B. Eisenbarth" Date: Thu, 20 May 2021 20:44:13 -0500 Subject: [PATCH] ENH: loosen XLS signature (#41321) --- doc/source/whatsnew/v1.3.0.rst | 4 +++- pandas/io/excel/_base.py | 30 ++++++++++++++++++--------- pandas/tests/io/excel/test_readers.py | 15 ++++++++++++-- pandas/tests/io/excel/test_xlrd.py | 18 ++++++++++++++++ 4 files changed, 54 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6f39dc4917024a..74f9af5bf8447d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -197,7 +197,7 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) -- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) +- :func:`pandas.read_excel` can now auto detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. 
``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -850,6 +850,8 @@ I/O - Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) +- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) +- Period ^^^^^^ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index cf2246f917bbec..9b8e40a9775454 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1014,16 +1014,21 @@ def close(self): return content -XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +XLS_SIGNATURES = ( + b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 + b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 + b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 + b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary +) ZIP_SIGNATURE = b"PK\x03\x04" -PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) +PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) @doc(storage_options=_shared_docs["storage_options"]) def inspect_excel_format( content_or_path: FilePathOrBuffer, storage_options: StorageOptions = None, -) -> str: +) -> str | None: """ Inspect the path or content of an excel file and get its format. @@ -1037,8 +1042,8 @@ def inspect_excel_format( Returns ------- - str - Format of file. + str or None + Format of file if it can be determined. 
Raises ------ @@ -1063,10 +1068,10 @@ def inspect_excel_format( peek = buf stream.seek(0) - if peek.startswith(XLS_SIGNATURE): + if any(peek.startswith(sig) for sig in XLS_SIGNATURES): return "xls" elif not peek.startswith(ZIP_SIGNATURE): - raise ValueError("File is not a recognized excel file") + return None # ZipFile typing is overly-strict # https://github.com/python/typeshed/issues/4212 @@ -1174,8 +1179,12 @@ def __init__( ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) + if ext is None: + raise ValueError( + "Excel file format cannot be determined, you must specify " + "an engine manually." + ) - # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") @@ -1190,12 +1199,13 @@ def __init__( path_or_buffer, storage_options=storage_options ) - if ext != "xls" and xlrd_version >= Version("2"): + # Pass through if ext is None, otherwise check if ext valid for xlrd + if ext and ext != "xls" and xlrd_version >= Version("2"): raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - elif ext != "xls": + elif ext and ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a46cb70097bd8e..aec638a0d86126 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -727,9 +727,20 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - if engine is None or engine == "xlrd": + if engine is None: error = ValueError - msg = "File is not a recognized excel file" + msg = ( + "Excel file format cannot be determined, you must " + "specify an engine manually." 
+ ) + elif engine == "xlrd": + from xlrd import XLRDError + + error = XLRDError + msg = ( + "Unsupported format, or corrupt file: Expected BOF " + "record; found b'foo'" + ) else: error = BadZipFile msg = "File is not a zip file" diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index bf0a0de442ae19..2bb9ba2a397be9 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,3 +1,5 @@ +import io + import pytest from pandas.compat._optional import import_optional_dependency @@ -8,6 +10,7 @@ from pandas.util.version import Version from pandas.io.excel import ExcelFile +from pandas.io.excel._base import inspect_excel_format xlrd = pytest.importorskip("xlrd") xlwt = pytest.importorskip("xlwt") @@ -78,3 +81,18 @@ def test_read_excel_warning_with_xlsx_file(datapath): else: with tm.assert_produces_warning(None): pd.read_excel(path, "Sheet1", engine=None) + + +@pytest.mark.parametrize( + "file_header", + [ + b"\x09\x00\x04\x00\x07\x00\x10\x00", + b"\x09\x02\x06\x00\x00\x00\x10\x00", + b"\x09\x04\x06\x00\x00\x00\x10\x00", + b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", + ], +) +def test_read_old_xls_files(file_header): + # GH 41225 + f = io.BytesIO(file_header) + assert inspect_excel_format(f) == "xls"