-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: loosen XLS signature #41321
ENH: loosen XLS signature #41321
Changes from 9 commits
c4bf8fb
95bf325
416ddd9
44053bb
1b1b648
3c11c74
9d0e8fa
3d79808
cbfa563
c113b20
f023c9d
955f679
d6206bc
c306f69
c741f32
8cc480e
03d6393
32b9d1d
9e9a8ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1014,16 +1014,22 @@ def close(self): | |
return content | ||
|
||
|
||
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" | ||
ZIP_SIGNATURE = b"PK\x03\x04" | ||
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) | ||
SIGNATURES = { | ||
"biff2": b"\x09\x00\x04\x00", | ||
"biff3": b"\x09\x02\x06\x00", | ||
"biff4": b"\x09\x04\x06\x00", | ||
"biff5": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", | ||
"biff8": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does the biff version matter? I think no - in which case can you make this a tuple with comments, e.g.
|
||
"zip": b"PK\x03\x04", | ||
} | ||
PEEK_SIZE = max(map(len, SIGNATURES.values())) | ||
|
||
|
||
@doc(storage_options=_shared_docs["storage_options"]) | ||
def inspect_excel_format( | ||
content_or_path: FilePathOrBuffer, | ||
storage_options: StorageOptions = None, | ||
) -> str: | ||
) -> str | None: | ||
""" | ||
Inspect the path or content of an excel file and get its format. | ||
|
||
|
@@ -1037,8 +1043,8 @@ def inspect_excel_format( | |
|
||
Returns | ||
------- | ||
str | ||
Format of file. | ||
str or None | ||
Format of file (if it can be determined) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: remove parentheses - the phrase is essential to the correctness of the statement and not supplemental. Also, period at the end. |
||
|
||
Raises | ||
------ | ||
|
@@ -1063,10 +1069,14 @@ def inspect_excel_format( | |
peek = buf | ||
stream.seek(0) | ||
|
||
if peek.startswith(XLS_SIGNATURE): | ||
if any( | ||
peek.startswith(signature) | ||
for (file_format, signature) in SIGNATURES.items() | ||
if file_format.startswith("biff") | ||
): | ||
return "xls" | ||
elif not peek.startswith(ZIP_SIGNATURE): | ||
raise ValueError("File is not a recognized excel file") | ||
elif not peek.startswith(SIGNATURES["zip"]): | ||
return None | ||
|
||
# ZipFile typing is overly-strict | ||
# https://github.com/python/typeshed/issues/4212 | ||
|
@@ -1174,8 +1184,12 @@ def __init__( | |
ext = inspect_excel_format( | ||
content_or_path=path_or_buffer, storage_options=storage_options | ||
) | ||
if ext is None: | ||
raise ValueError( | ||
"Excel file format cannot be determined, you must specify " | ||
"an engine manually." | ||
) | ||
|
||
# ext will always be valid, otherwise inspect_excel_format would raise | ||
engine = config.get_option(f"io.excel.{ext}.reader", silent=True) | ||
if engine == "auto": | ||
engine = get_default_engine(ext, mode="reader") | ||
|
@@ -1190,12 +1204,13 @@ def __init__( | |
path_or_buffer, storage_options=storage_options | ||
) | ||
|
||
if ext != "xls" and xlrd_version >= "2": | ||
# Pass through if ext is None, otherwise check if ext valid for xlrd | ||
if ext and ext != "xls" and xlrd_version >= "2": | ||
raise ValueError( | ||
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " | ||
f"only the xls format is supported. Install openpyxl instead." | ||
) | ||
elif ext != "xls": | ||
elif ext and ext != "xls": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if if this is genuine oversight then a test might be worth it for this case, and an errormessage, otherwise can ignore.. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @attack68 thanks for looking over this! My understanding of the desired behavior was to go ahead and try reading it with the Any input is appreciated, thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe.. not sure what is the best solution really. just wanted to make sure the case was considered... someone else might have a good idea or just leave it with that which you stated! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the user is specifying the engine explicitly, I do not think we should raise an error/warning. They have already instructed what they want to do. |
||
caller = inspect.stack()[1] | ||
if ( | ||
caller.filename.endswith( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The usage of signatures is internal to the implementation; can you change to something like "Bug in read_excel not detecting older xls formats". If you'd like to call this an enhancement (I don't have an opinion either way), then this line should be moved to the enhancement section above.
Also, I think a separate line should be added addressing the bugfix here that we would raise when engine is specified but the excel format could not be determined.