Add a numpy engine for reading using numpy.genfromtxt() #452

Merged · 21 commits · Jun 28, 2021
Commits
7b4c45f  Replace data section reader with pandas.read_csv (kinverarity1, Apr 17, 2021)
63aa445  Add engine="pandas" kwarg to LASFile.read (kinverarity1, Apr 18, 2021)
4246ce1  Fix GH CI bug? (kinverarity1, Apr 18, 2021)
6dcf20a  Experimental exploration with numpy.genfromtxt (dcslagel, Apr 18, 2021)
57bf80f  Rebase and make separate engine for 'numpy' reader (dcslagel, Apr 20, 2021)
f5dc804  Remove pandas engine (kinverarity1, Apr 21, 2021)
b65a0dd  Use normal engine for wrapped files (kinverarity1, Apr 21, 2021)
0cbf3ed  Format code with black (kinverarity1, Apr 21, 2021)
d3cea21  Remove pandas reader code (kinverarity1, Apr 21, 2021)
6d4c0af  Update from branch 'master' to numpy-genfromtxt-explore (dcslagel, Apr 21, 2021)
a614c1c  Handle numpy-engine data read exceptions (dcslagel, Apr 22, 2021)
528bd81  Numpy-engine temp workarounds for failing tests (dcslagel, Apr 23, 2021)
db551e8  Change 'f' formatting to old style for Python 3.5 (dcslagel, Apr 24, 2021)
0cb8d5b  Merge-squash numpy-genfromtext-explore on merge-base (dcslagel, May 1, 2021)
e276941  Sync most of numpy-engine work to current master 91f8eab (dcslagel, May 1, 2021)
39117b3  Interim check-in: add numpy-engine, but it hangs on a test (dcslagel, May 1, 2021)
b8b5bcb  Fix merge issues (dcslagel, May 2, 2021)
ac9e125  Sync with master's 91f8eab commit (dcslagel, May 2, 2021)
058b1bf  Handle single-row data in np.genfromtxt (dcslagel, May 26, 2021)
b54f538  Return to exception fallback to old parser (dcslagel, Jun 2, 2021)
468a0c6  Add conditions for falling back to the normal parser (dcslagel, Jun 11, 2021)
108 changes: 83 additions & 25 deletions lasio/las.py
@@ -82,13 +82,15 @@ def __init__(self, file_ref=None, **read_kwargs):
     def read(
         self,
         file_ref,
-        ignore_data=False,
-        read_policy="default",
-        null_policy="strict",
         ignore_header_errors=False,
         ignore_comments=("#",),
         ignore_data_comments="#",
         mnemonic_case="upper",
+        ignore_data=False,
+        engine="numpy",
+        use_normal_engine_for_wrapped=True,
+        read_policy="default",
+        null_policy="strict",
         index_unit=None,
         dtypes="auto",
         **kwargs
@@ -100,10 +102,6 @@ def read(
             object, or a string containing the contents of a file.

         Keyword Arguments:
-            null_policy (str or list): see
-                http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically
-            ignore_data (bool): if True, do not read in any of the actual data,
-                just the header metadata. False by default.
             ignore_header_errors (bool): ignore LASHeaderErrors (False by
                 default)
             ignore_comments (sequence/str): ignore lines beginning with these
@@ -113,6 +111,19 @@
             mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                 'upper': convert all HeaderItem mnemonics to uppercase
                 'lower': convert all HeaderItem mnemonics to lowercase
+            ignore_data (bool): if True, do not read in any of the actual data,
+                just the header metadata. False by default.
+            engine (str): "normal": parse data section with normal Python reader
+                (quite slow); "numpy": parse data section with `numpy.genfromtxt`
+                (fast). By default the engine is "numpy".
+            use_normal_engine_for_wrapped (bool): if header metadata indicates
+                that the file is wrapped, always use the 'normal' engine. Default
+                is True. The only reason to use False is if speed is a very high
+                priority and you have files whose metadata incorrectly indicates
+                they are wrapped.
+            read_policy (): TODO
+            null_policy (str or list): see
+                http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically
             index_unit (str): Optionally force-set the index curve's unit to "m" or "ft"
             dtypes ("auto", dict or list): specify the data types for each curve in the
                 ~ASCII data section. If "auto", each curve will be converted to floats if
@@ -151,7 +162,6 @@ def read(
         regexp_subs, value_null_subs, version_NULL = reader.get_substitutions(
             read_policy, null_policy
         )
-
         provisional_version = 2.0
         provisional_wrapped = "YES"
         provisional_null = None
@@ -257,6 +267,22 @@
                 las3_data_section_indices.append(i)

         if not ignore_data:
+
+            # Override the default "numpy" parser with the 'normal' parser
+            # under these conditions:
+            #   - the file is wrapped
+            #   - null_policy is not "strict"
+            #   - dtypes is not "auto" (numpy can handle specified dtypes, but
+            #     performance decays to the 'normal' engine's level)
+            if provisional_wrapped == "YES" or null_policy != "strict" or dtypes != "auto":
+                if engine != "normal":
+                    logger.warning("Only engine='normal' can read wrapped files")
+                if use_normal_engine_for_wrapped:
+                    engine = "normal"
+
+            # Check the number of columns in each data section.
             for k, first_line, last_line, section_title in [
                 section_positions[i] for i in data_section_indices
             ]:
@@ -283,23 +309,55 @@
                     dtypes = [dtypes.get(c.mnemonic, float) for c in self.curves]

                 # Notes: see 2d9e43c3 and e960998f for 'try' background
-                try:
-                    curves_data_gen = reader.read_data_section_iterative(
-                        file_obj,
-                        (first_line, last_line),
-                        regexp_subs,
-                        value_null_subs,
-                        ignore_comments=ignore_data_comments,
-                        n_columns=reader_n_columns,
-                        dtypes=dtypes,
-                    )
-                except KeyboardInterrupt:
-                    raise
-                except:
-                    raise exceptions.LASDataError(
-                        traceback.format_exc()[:-1]
-                        + " in data section beginning line {}".format(i + 1)
-                    )
+
+                # Attempt to read the data section.
+                if engine == "numpy":
+                    try:
+                        curves_data_gen = reader.read_data_section_iterative_numpy_engine(
+                            file_obj, (first_line, last_line)
+                        )
+                    except KeyboardInterrupt:
+                        raise
+                    except:
+                        try:
+                            file_obj.seek(k)
+                            curves_data_gen = reader.read_data_section_iterative_normal_engine(
+                                file_obj,
+                                (first_line, last_line),
+                                regexp_subs,
+                                value_null_subs,
+                                ignore_comments=ignore_data_comments,
+                                n_columns=reader_n_columns,
+                                dtypes=dtypes,
+                            )
+                        except KeyboardInterrupt:
+                            raise
+                        except:
+                            raise exceptions.LASDataError(
+                                traceback.format_exc()[:-1]
+                                + " in data section beginning line {}".format(i + 1)
+                            )
+
+                if engine == "normal":
+                    try:
+                        curves_data_gen = reader.read_data_section_iterative_normal_engine(
+                            file_obj,
+                            (first_line, last_line),
+                            regexp_subs,
+                            value_null_subs,
+                            ignore_comments=ignore_data_comments,
+                            n_columns=reader_n_columns,
+                            dtypes=dtypes,
+                        )
+                    except KeyboardInterrupt:
+                        raise
+                    except:
+                        raise exceptions.LASDataError(
+                            traceback.format_exc()[:-1]
+                            + " in data section beginning line {}".format(i + 1)
+                        )

                 # Assign data to curves.
                 curve_idx = 0
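
The new keyword arguments surface at the top level roughly as follows. This is a minimal usage sketch rather than part of the diff; the file names are hypothetical, and engine="numpy" is simply the default written out explicitly.

import lasio

# Default: the fast numpy.genfromtxt-based parser for the data section.
las = lasio.read("example.las", engine="numpy")

# Force the slower pure-Python parser.
las_normal = lasio.read("example.las", engine="normal")

# Files whose header says they are wrapped are routed to the 'normal'
# engine automatically; disable that only if the wrap flag is known to be
# wrong and speed matters more.
las_wrapped = lasio.read("wrapped.las", use_normal_engine_for_wrapped=True)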
38 changes: 37 additions & 1 deletion lasio/reader.py
@@ -369,7 +369,7 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, ignore_comments="#"):
     return item_counts[0]


-def read_data_section_iterative(
+def read_data_section_iterative_normal_engine(
     file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns, dtypes
 ):
     """Read data section into memory.
@@ -510,6 +510,42 @@ def identify_dtypes_from_data(row):
     return dtypes_list


+def read_data_section_iterative_numpy_engine(file_obj, line_nos):
+    """Read data section into memory.
+
+    Arguments:
+        file_obj: file-like object open for reading at the beginning of the section
+        line_nos (tuple): the first and last line numbers of the section to read
+
+    Returns:
+        A numpy ndarray.
+    """
+    first_line = line_nos[0] + 1
+    last_line = line_nos[1]
+    max_rows = last_line - first_line
+
+    file_obj.seek(0)
+
+    # unpack=True transforms the data from an array of rows to an array of
+    # columns. loose=False will throw an error on non-numerical data, which
+    # then sends the parsing to the 'normal' parser.
+    array = np.genfromtxt(
+        file_obj,
+        skip_header=first_line,
+        max_rows=max_rows,
+        names=None,
+        unpack=True,
+        loose=False,
+    )
+
+    # If there is only one data row, np.genfromtxt returns a flat array of
+    # individual values, but lasio needs an array of arrays. Reshape the
+    # single-row result into an array of single-value columns.
+    if len(array.shape) == 1:
+        arr_len = array.shape[0]
+        array = array.reshape(arr_len, 1)
+
+    return array
+
+
 def get_substitutions(read_policy, null_policy):
     """Parse read and null policy definitions into a list of regexp and value
     substitutions.
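
As a standalone illustration of what the numpy engine relies on, the sketch below exercises numpy.genfromtxt directly on an in-memory two-curve section; it mirrors, but is not, the lasio code above.

import io

import numpy as np

section = "1670.0 123.4\n1669.875 123.1\n"  # two rows, two curves

# unpack=True returns one array per column; loose=False raises on
# non-numeric tokens instead of silently inserting nan.
columns = np.genfromtxt(io.StringIO(section), unpack=True, loose=False)
print(columns.shape)  # (2, 2): one array per curve

# A single data row comes back as a flat 1-D array...
single = np.genfromtxt(io.StringIO("1670.0 123.4\n"), unpack=True, loose=False)
print(single.shape)  # (2,)

# ...so it has to be reshaped into one single-value array per curve, which
# is exactly the special case handled at the end of the new function.
print(single.reshape(single.shape[0], 1).shape)  # (2, 1)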
1 change: 1 addition & 0 deletions tests/test_null_policy.py
@@ -78,6 +78,7 @@ def test_null_policy_custom_2():


 def test_null_policy_ERR_strict():
+    # Verify we can read in text in a mostly numerical column
     las = read(egfn("null_policy_ERR.las"), null_policy="strict")
     assert las["RHOB"][2] == "ERR"

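
The test above passes because loose=False makes the numpy engine raise on the text value "ERR", triggering the fallback to the normal engine, which can retain the text. A standalone sketch of that trigger (assuming ValueError is what numpy's strict converters raise):

import io

import numpy as np

rows = "2.0 1.5\n2.1 ERR\n"  # a text value inside a numeric column

try:
    np.genfromtxt(io.StringIO(rows), unpack=True, loose=False)
except ValueError:
    # This is the condition under which LASFile.read falls back to
    # read_data_section_iterative_normal_engine.
    print("non-numeric data: falling back to the 'normal' engine")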
2 changes: 1 addition & 1 deletion tests/test_read.py
@@ -483,4 +483,4 @@ def test_sample_dtypes_specified_as_false():
 def test_index_null_issue227():
     las = lasio.examples.open("index_null.las")
     assert las['DEPT'].data[1] == 999.25
-    assert numpy.isnan(las['DT'].data[0])
\ No newline at end of file
+    assert numpy.isnan(las['DT'].data[0])
1 change: 1 addition & 0 deletions tests/test_speed.py
@@ -22,5 +22,6 @@
 def read_file():
     las = lasio.read(stegfn("1.2", "sample_big.las"))

+
 def test_read_v12_sample_big(benchmark):
     benchmark(read_file)
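
The benchmark above reads the big sample file with the default engine. A hedged sketch for timing both engines side by side with pytest-benchmark, assuming the same stegfn helper and lasio import already present at the top of this test module:

def read_numpy_engine():
    lasio.read(stegfn("1.2", "sample_big.las"), engine="numpy")

def read_normal_engine():
    lasio.read(stegfn("1.2", "sample_big.las"), engine="normal")

def test_speed_numpy_engine(benchmark):
    benchmark(read_numpy_engine)

def test_speed_normal_engine(benchmark):
    benchmark(read_normal_engine)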