From 7b4c45ffc5203d71e575ad9a80050bfc9506c03c Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Sat, 17 Apr 2021 21:13:38 +0930 Subject: [PATCH 01/18] Replace data section reader with pandas.read_csv --- lasio/las.py | 4 ++-- lasio/reader.py | 32 ++++++-------------------------- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 3c61cbc..31157ce 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -10,6 +10,7 @@ import logging import re import sys +import traceback # get basestring in py3 @@ -369,8 +370,7 @@ def read( self.index_unit = None def update_start_stop_step(self, STRT=None, STOP=None, STEP=None, fmt="%.5f"): - """Configure or Change STRT, STOP, and STEP values - """ + """Configure or Change STRT, STOP, and STEP values""" if STRT is None: STRT = self.index[0] if STOP is None: diff --git a/lasio/reader.py b/lasio/reader.py index ce20f53..d8b4920 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -7,6 +7,7 @@ import urllib.request import numpy as np +import pandas as pd from . 
import defaults @@ -423,32 +424,11 @@ def read_data_section_iterative( title = file_obj.readline() - def items(f, start_line_no, end_line_no): - line_no = start_line_no - for line in f: - line_no += 1 - logger.debug( - "Line {}: reading data '{}'".format( - line_no + 1, line.strip("\n").strip() - ) - ) - if remove_line_filter(line): - continue - else: - for pattern, sub_str in regexp_subs: - line = re.sub(pattern, sub_str, line) - line = line.replace(chr(26), "") - for item in split_on_whitespace(line): - try: - yield np.float64(item) - except ValueError: - yield item - if line_no == end_line_no: - break - - array = np.array( - [i for i in items(file_obj, start_line_no=line_nos[0], end_line_no=line_nos[1])] - ) + nrows = (line_nos[1] - line_nos[0]) + 1 + + logger.debug("Read data section using pd.read_csv") + array = pd.read_csv(file_obj, skiprows=0, header=None, nrows=nrows, delim_whitespace=True).values + for value in value_null_subs: array[array == value] = np.nan return array From 63aa4457210ae64e54ddbdf90463ef5bfbbbc287 Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Sun, 18 Apr 2021 22:50:32 +0930 Subject: [PATCH 02/18] Add engine="pandas" kwarg to LASFile.read This commit adds the normal data section reader back into reader.py and also adds some keyword arguments to LASFile.read. The behaviour is unchanged if engine == "normal" (default). If engine == "pandas", then: If the file is wrapped: if pandas_engine_wrapped_error is True (default): a LASDataError exception is raised. if False, a logger.warning message is emitted. The data section is then read using reader.py:read_data_section_iterative_pandas_engine() If an exception is raised in that function: If pandas_engine_error == "retry" (default): the data section will be re-read by the normal parser. Otherwise if it is "error": the exception will be raised. One problem is that pd.read_csv doesn't always raise an exception as we'd perhaps like it to. 
--- lasio/las.py | 113 ++++++++++++++++++++++++++++++++++++++---------- lasio/reader.py | 77 +++++++++++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 28 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 31157ce..85a26bd 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -81,15 +81,18 @@ def __init__(self, file_ref=None, **read_kwargs): def read( self, file_ref, - ignore_data=False, - read_policy="default", - null_policy="strict", ignore_header_errors=False, ignore_comments=("#",), mnemonic_case="upper", + ignore_data=False, + engine="normal", + pandas_engine_error="retry", + pandas_engine_wrapped_error=True, + read_policy="default", + null_policy="strict", index_unit=None, remove_data_line_filter="#", - **kwargs + **kwargs, ): """Read a LAS file. @@ -98,10 +101,6 @@ def read( object, or a string containing the contents of a file. Keyword Arguments: - null_policy (str or list): see - http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically - ignore_data (bool): if True, do not read in any of the actual data, - just the header metadata. False by default. ignore_header_errors (bool): ignore LASHeaderErrors (False by default) ignore_comments (tuple/str): ignore comments beginning with characters @@ -109,13 +108,28 @@ def read( mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics 'upper': convert all HeaderItem mnemonics to uppercase 'lower': convert all HeaderItem mnemonics to lowercase - index_unit (str): Optionally force-set the index curve's unit to "m" or "ft" + ignore_data (bool): if True, do not read in any of the actual data, + just the header metadata. False by default. + engine (str): "normal": parse data section with normal Python+numpy reader + (quite slow); "pandas": parse data section with `pandas.read_csv` + (fast, but read_policy and null_policy are ignored and + remove_data_line_filter can only accept a single character to + ignore lines based on the first character). 
+ pandas_engine_error (str): what to do when the pandas engine encounters + an exception? Either "error": raise the exception; or "retry": + attempt to re-read the data section using the normal reader. + pandas_engine_wrapped_error (bool): raise Exception if pandas engine is + used to read a wrapped LAS file, default True. + read_policy (): TODO + null_policy (str or list): see + http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically remove_data_line_filter (str, func): string or function for removing/ignoring lines in the data section e.g. a function which accepts a string (a line from the data section) and returns either True (do not parse the line) or False (parse the line). If this argument is a string it will instead be converted to a function which rejects all lines starting with that value e.g. ``"#"`` will be converted to ``lambda line: line.strip().startswith("#")`` + index_unit (str): Optionally force-set the index curve's unit to "m" or "ft" See :func:`lasio.reader.open_with_codecs` for additional keyword arguments which help to manage issues relate to character encodings. @@ -124,6 +138,20 @@ def read( logger.debug("Reading {}...".format(str(file_ref))) + # Options specific to the pandas reader. + if engine == "pandas": + if isinstance(remove_data_line_filter, str): + remove_startswith = remove_data_line_filter + logger.debug( + f"Setting remove_startswith = '{remove_startswith}' for pandas engine" + ) + else: + logger.debug( + f"Not setting remove_startswith for pandas engine " + f" (don't understand {remove_data_line_filter}" + ) + remove_startswith = [] + file_obj = "" try: file_obj, self.encoding = reader.open_file(file_ref, **kwargs) @@ -256,22 +284,59 @@ def read( ) file_obj.seek(k) + + # Read data section. 
# Notes see 2d9e43c3 and e960998f for 'try' background - try: - arr = reader.read_data_section_iterative( - file_obj, - (first_line, last_line), - regexp_subs, - value_null_subs, - remove_line_filter=remove_data_line_filter, - ) - except KeyboardInterrupt: - raise - except: - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) + if engine == "pandas": + run_normal_engine = False + + # Issue a warning if pandas engine attempt to read wrapped file + if provisional_wrapped == "YES": + msg = f"{file_obj} is wrapped but engine='pandas' doesn't support wrapped files" + if pandas_engine_wrapped_error: + raise exceptions.LASDataError(msg) + else: + logger.warning(msg) + + try: + arr = reader.read_data_section_iterative_pandas_engine( + file_obj, + (first_line, last_line), + regexp_subs, + value_null_subs, + remove_startswith=remove_startswith, + ) + except KeyboardInterrupt: + raise + except: + if pandas_engine_error == "error": + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + elif pandas_engine_error == "retry": + run_normal_engine = True + + elif engine == "normal": + run_normal_engine = True + + if run_normal_engine: + try: + arr = reader.read_data_section_iterative_normal_engine( + file_obj, + (first_line, last_line), + regexp_subs, + value_null_subs, + remove_line_filter=remove_data_line_filter, + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + logger.debug("Read ndarray {arrshape}".format(arrshape=arr.shape)) # This is so we can check data size and use self.set_data(data, truncate=False) diff --git a/lasio/reader.py b/lasio/reader.py index d8b4920..40368b3 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -7,7 +7,6 @@ import urllib.request import numpy as np -import pandas as pd from . 
import defaults @@ -394,7 +393,7 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, remove_line_filter="#" return item_counts[0] -def read_data_section_iterative( +def read_data_section_iterative_normal_engine( file_obj, line_nos, regexp_subs, value_null_subs, remove_line_filter ): """Read data section into memory. @@ -419,15 +418,85 @@ def read_data_section_iterative( A 1-D numpy ndarray. """ - + logger.debug("Parsing data section with normal reader") remove_line_filter = convert_remove_line_filter(remove_line_filter) title = file_obj.readline() + def items(f, start_line_no, end_line_no): + line_no = start_line_no + for line in f: + line_no += 1 + logger.debug( + "Line {}: reading data '{}'".format( + line_no + 1, line.strip("\n").strip() + ) + ) + if remove_line_filter(line): + continue + else: + for pattern, sub_str in regexp_subs: + line = re.sub(pattern, sub_str, line) + line = line.replace(chr(26), "") + for item in split_on_whitespace(line): + try: + yield np.float64(item) + except ValueError: + yield item + if line_no == end_line_no: + break + + array = np.array( + [i for i in items(file_obj, start_line_no=line_nos[0], end_line_no=line_nos[1])] + ) + for value in value_null_subs: + array[array == value] = np.nan + return array + + +def read_data_section_iterative_pandas_engine( + file_obj, line_nos, regexp_subs, value_null_subs, remove_startswith=None +): + """Read data section into memory. + + Arguments: + file_obj: file-like object open for reading at the beginning of the section + line_nos (tuple): the first and last line no of the section to read + regexp_subs (list): each item should be a tuple of the pattern and + substitution string for a call to re.sub() on each line of the + data section. See defaults.py READ_SUBS and NULL_SUBS for examples. + value_null_subs (list): list of numerical values to be replaced by + numpy.nan values. + remove_startswith (str): reject all lines starting with that value e.g. 
``"#"``
+ +
+ Returns: + A 1-D numpy ndarray. + + """ + import pandas as pd + + logger.debug(f"regexp_subs: {regexp_subs}") + na_str_values = [pattern for pattern, sub in regexp_subs] + na_str_values = [] + + title = file_obj.readline() + + nrows = (line_nos[1] - line_nos[0]) + 1 logger.debug("Read data section using pd.read_csv") - array = pd.read_csv(file_obj, skiprows=0, header=None, nrows=nrows, delim_whitespace=True).values + kws = {} + if remove_startswith: + kws["comment"] = remove_startswith + array = pd.read_csv( + file_obj, + skiprows=0, + header=None, + nrows=nrows, + delim_whitespace=True, + na_values=na_str_values, + **kws, + ).values for value in value_null_subs: array[array == value] = np.nan From 4246ce1beb43b4fe3a34b8983ac8856068cac92f Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Sun, 18 Apr 2021 23:00:14 +0930 Subject: [PATCH 03/18] Fix GH CI bug? --- lasio/las.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lasio/las.py b/lasio/las.py index 85a26bd..57e4a18 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -92,7 +92,7 @@ def read( null_policy="strict", index_unit=None, remove_data_line_filter="#", - **kwargs, + **kwargs ): """Read a LAS file. From 6dcf20a049bc77da6710135594d141c27a786aca Mon Sep 17 00:00:00 2001 From: dcslagel Date: Sun, 18 Apr 2021 13:58:56 -0600 Subject: [PATCH 04/18] Experimental exploration with numpy.genfromtxt This checkin is a quick hack to get an initial view of using numpy.genfromtxt() for importing data sections. This checkin is based on the pandas-readcsv branch content and makes the following changes: - Set 'pandas' as the default engine. This is so we can run all the current tests with 'pandas' (actually numpy.genfromtxt()) and get an initial view of any test failures. 
- Replace the actual 'pandas.read_csv(...)' call with numpy.genfromtxt() --- lasio/las.py | 2 +- lasio/reader.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lasio/las.py b/lasio/las.py index 57e4a18..1684a92 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -85,7 +85,7 @@ def read( ignore_comments=("#",), mnemonic_case="upper", ignore_data=False, - engine="normal", + engine="pandas", pandas_engine_error="retry", pandas_engine_wrapped_error=True, read_policy="default", diff --git a/lasio/reader.py b/lasio/reader.py index 40368b3..2271ce5 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -488,6 +488,8 @@ def read_data_section_iterative_pandas_engine( kws = {} if remove_startswith: kws["comment"] = remove_startswith + array = np.genfromtxt(file_obj) + """ array = pd.read_csv( file_obj, skiprows=0, @@ -497,6 +499,7 @@ def read_data_section_iterative_pandas_engine( na_values=na_str_values, **kws, ).values + """ for value in value_null_subs: array[array == value] = np.nan From 57bf80fbd688954e63552110a10d2bba630bd83a Mon Sep 17 00:00:00 2001 From: dcslagel Date: Tue, 20 Apr 2021 14:38:19 -0600 Subject: [PATCH 05/18] Rebase and make separate engine for 'numpy' reader --- lasio/las.py | 23 +++++++++++++++++++++-- lasio/reader.py | 30 +++++++++++++++++++++++++++--- lasio/writer.py | 18 +++++++++++++++++- tests/test_write.py | 2 +- 4 files changed, 66 insertions(+), 7 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 1684a92..649b919 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -66,6 +66,7 @@ def __init__(self, file_ref=None, **read_kwargs): super(LASFile, self).__init__() self._text = "" self.index_unit = None + self.index_initial = None default_items = defaults.get_default_items() self.sections = { "Version": default_items["Version"], @@ -85,7 +86,7 @@ def read( ignore_comments=("#",), mnemonic_case="upper", ignore_data=False, - engine="pandas", + engine="numpy", pandas_engine_error="retry", pandas_engine_wrapped_error=True, 
read_policy="default", @@ -287,7 +288,22 @@ def read( # Read data section. # Notes see 2d9e43c3 and e960998f for 'try' background - if engine == "pandas": + run_normal_engine = False + if engine == "numpy": + run_normal_engine = False + try: + arr = reader.read_data_section_iterative_numpy_engine( + file_obj, + (first_line, last_line) + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + elif engine == "pandas": run_normal_engine = False # Issue a warning if pandas engine attempt to read wrapped file @@ -434,6 +450,9 @@ def read( logger.warning("Conflicting index units found: {}".format(matches)) self.index_unit = None + if len(self.curves) > 0: + self.index_initial = self.index.copy() + def update_start_stop_step(self, STRT=None, STOP=None, STEP=None, fmt="%.5f"): """Configure or Change STRT, STOP, and STEP values""" if STRT is None: diff --git a/lasio/reader.py b/lasio/reader.py index 2271ce5..f5f3f50 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -454,6 +454,33 @@ def items(f, start_line_no, end_line_no): return array +def read_data_section_iterative_numpy_engine(file_obj, line_nos): + """Read data section into memory. + + Arguments: + file_obj: file-like object open for reading at the beginning of the section + line_nos (tuple): the first and last line no of the section to read + + + Returns: + A numpy ndarray. 
+ """ + + first_line = line_nos[0] + 1 + last_line = line_nos[1] + max_rows = last_line - first_line + + file_obj.seek(0) + + array = np.genfromtxt( + file_obj, + skip_header=first_line, + max_rows=max_rows, + names=None + ) + return array + + def read_data_section_iterative_pandas_engine( file_obj, line_nos, regexp_subs, value_null_subs, remove_startswith=None ): @@ -488,8 +515,6 @@ def read_data_section_iterative_pandas_engine( kws = {} if remove_startswith: kws["comment"] = remove_startswith - array = np.genfromtxt(file_obj) - """ array = pd.read_csv( file_obj, skiprows=0, @@ -499,7 +524,6 @@ def read_data_section_iterative_pandas_engine( na_values=na_str_values, **kws, ).values - """ for value in value_null_subs: array[array == value] = np.nan diff --git a/lasio/writer.py b/lasio/writer.py index a228fad..b7353c6 100644 --- a/lasio/writer.py +++ b/lasio/writer.py @@ -103,7 +103,23 @@ def write( "VERS", "", 2.0, "CWLS log ASCII Standard -VERSION 2.0" ) - las.update_start_stop_step() + # ------------------------------------------------------------------------- + # If an initial curve index was not read from a las file (las.index_initial) + # or the curve index has changed during processing + # or if the STOP value doesn't match the final index value + # then update the step variables before writing to a new las file object. + # ------------------------------------------------------------------------- + index_changed = False + stop_is_different = False + + if las.index_initial is not None: + index_changed = not (las.index_initial == las.index).all() + stop_is_different = las.index_initial[-1] != las.well.STOP.value + else: + index_changed = True + + if index_changed or stop_is_different: + las.update_start_stop_step(STRT, STOP, STEP) # Write each section. 
# get_formatter_function ( ** get_section_widths ) diff --git a/tests/test_write.py b/tests/test_write.py index c94792d..2e40fd4 100644 --- a/tests/test_write.py +++ b/tests/test_write.py @@ -501,7 +501,7 @@ def test_write_single_step(): ~Well ------------------------------------------------------ STRT.M 1670.0 : START DEPTH STOP.M 1670.0 : STOP DEPTH -STEP.M 0 : STEP +STEP.M 0.0 : STEP NULL. -999.25 : NULL VALUE COMP. ANY OIL COMPANY INC. : COMPANY WELL. AAAAA_2 : WELL From f5dc804513a33808c0576ac5e93dc58eaeff2428 Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Wed, 21 Apr 2021 22:35:19 +0930 Subject: [PATCH 06/18] Remove pandas engine --- lasio/las.py | 65 ++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 649b919..4530bee 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -93,7 +93,7 @@ def read( null_policy="strict", index_unit=None, remove_data_line_filter="#", - **kwargs + **kwargs, ): """Read a LAS file. @@ -111,16 +111,14 @@ def read( 'lower': convert all HeaderItem mnemonics to lowercase ignore_data (bool): if True, do not read in any of the actual data, just the header metadata. False by default. - engine (str): "normal": parse data section with normal Python+numpy reader - (quite slow); "pandas": parse data section with `pandas.read_csv` - (fast, but read_policy and null_policy are ignored and - remove_data_line_filter can only accept a single character to - ignore lines based on the first character). - pandas_engine_error (str): what to do when the pandas engine encounters - an exception? Either "error": raise the exception; or "retry": - attempt to re-read the data section using the normal reader. - pandas_engine_wrapped_error (bool): raise Exception if pandas engine is - used to read a wrapped LAS file, default True. 
+ engine (str): "normal": parse data section with normal Python reader + (quite slow); "numpy": parse data section with `numpy.genfromtxt` (fast). + By default the engine is "numpy". + use_normal_engine_for_wrapped (bool): if header metadata indicates that + the file is wrapped, always use the 'normal' engine. Default is True. + The only reason you should use False is if speed is a very high priority + and you had files with metadata that incorrectly indicates they are + wrapped. read_policy (): TODO null_policy (str or list): see http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically @@ -139,16 +137,16 @@ def read( logger.debug("Reading {}...".format(str(file_ref))) - # Options specific to the pandas reader. - if engine == "pandas": + # Options specific to the numpy reader. + if engine == "numpy": if isinstance(remove_data_line_filter, str): remove_startswith = remove_data_line_filter logger.debug( - f"Setting remove_startswith = '{remove_startswith}' for pandas engine" + f"Setting remove_startswith = '{remove_startswith}' for numpy engine" ) else: logger.debug( - f"Not setting remove_startswith for pandas engine " + f"Not setting remove_startswith for numpy engine " f" (don't understand {remove_data_line_filter}" ) remove_startswith = [] @@ -288,13 +286,14 @@ def read( # Read data section. 
# Notes see 2d9e43c3 and e960998f for 'try' background + run_normal_engine = False + + # Attempt to read the data section if engine == "numpy": - run_normal_engine = False try: arr = reader.read_data_section_iterative_numpy_engine( - file_obj, - (first_line, last_line) + file_obj, (first_line, last_line) ) except KeyboardInterrupt: raise @@ -303,36 +302,6 @@ def read( traceback.format_exc()[:-1] + " in data section beginning line {}".format(i + 1) ) - elif engine == "pandas": - run_normal_engine = False - - # Issue a warning if pandas engine attempt to read wrapped file - if provisional_wrapped == "YES": - msg = f"{file_obj} is wrapped but engine='pandas' doesn't support wrapped files" - if pandas_engine_wrapped_error: - raise exceptions.LASDataError(msg) - else: - logger.warning(msg) - - try: - arr = reader.read_data_section_iterative_pandas_engine( - file_obj, - (first_line, last_line), - regexp_subs, - value_null_subs, - remove_startswith=remove_startswith, - ) - except KeyboardInterrupt: - raise - except: - if pandas_engine_error == "error": - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) - elif pandas_engine_error == "retry": - run_normal_engine = True - elif engine == "normal": run_normal_engine = True From b65a0dd1807f6217022c01b4cfc0e84b370a7741 Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Wed, 21 Apr 2021 22:35:46 +0930 Subject: [PATCH 07/18] Use normal engine for wrapped files --- lasio/las.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lasio/las.py b/lasio/las.py index 4530bee..2bfac3f 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -87,6 +87,7 @@ def read( mnemonic_case="upper", ignore_data=False, engine="numpy", + use_normal_engine_for_wrapped=True, pandas_engine_error="retry", pandas_engine_wrapped_error=True, read_policy="default", @@ -269,6 +270,16 @@ def read( las3_data_section_indices.append(i) if not ignore_data: + + # Check whether file is 
wrapped and if so, attempt to use the + # normal engine. + if provisional_wrapped == "YES": + if engine != "normal": + logger.warning("Only engine='normal' can read wrapped files") + if use_normal_engine_for_wrapped: + engine = "normal" + + # Check for the number of columns in each data section. for k, first_line, last_line, section_title in [ section_positions[i] for i in data_section_indices ]: From 0cbf3ed60d9d21df3f0e19bbb2f98576c273a35b Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Wed, 21 Apr 2021 22:36:10 +0930 Subject: [PATCH 08/18] Format code with black --- lasio/reader.py | 5 +---- tests/test_speed.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lasio/reader.py b/lasio/reader.py index f5f3f50..c59b42c 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -473,10 +473,7 @@ def read_data_section_iterative_numpy_engine(file_obj, line_nos): file_obj.seek(0) array = np.genfromtxt( - file_obj, - skip_header=first_line, - max_rows=max_rows, - names=None + file_obj, skip_header=first_line, max_rows=max_rows, names=None ) return array diff --git a/tests/test_speed.py b/tests/test_speed.py index a02dd76..1319774 100644 --- a/tests/test_speed.py +++ b/tests/test_speed.py @@ -22,5 +22,6 @@ def read_file(): las = lasio.read(stegfn("1.2", "sample_big.las")) + def test_read_v12_sample_big(benchmark): benchmark(read_file) From d3cea21048dabd282e7d3513049d92677f853b25 Mon Sep 17 00:00:00 2001 From: Kent Inverarity Date: Wed, 21 Apr 2021 22:38:05 +0930 Subject: [PATCH 09/18] Remove pandas reader code --- lasio/reader.py | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/lasio/reader.py b/lasio/reader.py index c59b42c..9309b90 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -478,55 +478,6 @@ def read_data_section_iterative_numpy_engine(file_obj, line_nos): return array -def read_data_section_iterative_pandas_engine( - file_obj, line_nos, regexp_subs, value_null_subs, remove_startswith=None 
-): - """Read data section into memory. - - Arguments: - file_obj: file-like object open for reading at the beginning of the section - line_nos (tuple): the first and last line no of the section to read - regexp_subs (list): each item should be a tuple of the pattern and - substitution string for a call to re.sub() on each line of the - data section. See defaults.py READ_SUBS and NULL_SUBS for examples. - value_null_subs (list): list of numerical values to be replaced by - numpy.nan values. - remove_startswith (str): reject all lines starting with that value e.g. ``"#"`` - - - Returns: - A 1-D numpy ndarray. - - """ - import pandas as pd - - logger.debug(f"regexp_subs: {regexp_subs}") - na_str_values = [pattern for pattern, sub in regexp_subs] - na_str_values = [] - - title = file_obj.readline() - - nrows = (line_nos[1] - line_nos[0]) + 1 - - logger.debug("Read data section using pd.read_csv") - kws = {} - if remove_startswith: - kws["comment"] = remove_startswith - array = pd.read_csv( - file_obj, - skiprows=0, - header=None, - nrows=nrows, - delim_whitespace=True, - na_values=na_str_values, - **kws, - ).values - - for value in value_null_subs: - array[array == value] = np.nan - return array - - def get_substitutions(read_policy, null_policy): """Parse read and null policy definitions into a list of regexp and value substitutions. From a614c1c74e4ebcc3f84031dd26e157ed277bbab4 Mon Sep 17 00:00:00 2001 From: dcslagel Date: Thu, 22 Apr 2021 15:20:35 -0600 Subject: [PATCH 10/18] Handle numpy-engine data read exceptions - If numpy-engine throws an exception on data-read then retry with the normal engine. - Remove '_iterative' from the names of the data-read engine functions. 
--- lasio/las.py | 15 ++++++++------- lasio/reader.py | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 2bfac3f..b34227b 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -303,22 +303,23 @@ def read( # Attempt to read the data section if engine == "numpy": try: - arr = reader.read_data_section_iterative_numpy_engine( + arr = reader.read_data_section_numpy_engine( file_obj, (first_line, last_line) ) except KeyboardInterrupt: raise - except: - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) + except ValueError as err: + # If there is a ValueError, numpy-engine failed to read the data. + # So configure to re-try data-read with the normal-engine. + run_normal_engine = True + file_obj.seek(k) + elif engine == "normal": run_normal_engine = True if run_normal_engine: try: - arr = reader.read_data_section_iterative_normal_engine( + arr = reader.read_data_section_normal_engine( file_obj, (first_line, last_line), regexp_subs, diff --git a/lasio/reader.py b/lasio/reader.py index 9309b90..89b02b3 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -393,7 +393,7 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, remove_line_filter="#" return item_counts[0] -def read_data_section_iterative_normal_engine( +def read_data_section_normal_engine( file_obj, line_nos, regexp_subs, value_null_subs, remove_line_filter ): """Read data section into memory. @@ -454,7 +454,7 @@ def items(f, start_line_no, end_line_no): return array -def read_data_section_iterative_numpy_engine(file_obj, line_nos): +def read_data_section_numpy_engine(file_obj, line_nos): """Read data section into memory. Arguments: From 528bd81ff625183a150019c36d579fb60dfe60fd Mon Sep 17 00:00:00 2001 From: dcslagel Date: Fri, 23 Apr 2021 17:01:44 -0600 Subject: [PATCH 11/18] Numpy-engine temp workarounds for failing tests Make these temporary changes to enable integrating the numpy-engine. 
- Route "aggressive" and "all" null_policies to the normal-engine. - Set tests that fail for numpy-engine to XFAIL. These tests will continue to pass for the normal-engine. - First draft of using genfromtxt's usemask and missing_values to align functionality with the normal-engine. This needs follow-up work. --- lasio/las.py | 4 ++-- lasio/reader.py | 27 +++++++++++++++++++++++++-- tests/test_null_policy.py | 2 ++ tests/test_read.py | 5 +++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index b34227b..82489a7 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -94,7 +94,7 @@ def read( null_policy="strict", index_unit=None, remove_data_line_filter="#", - **kwargs, + **kwargs ): """Read a LAS file. @@ -304,7 +304,7 @@ def read( if engine == "numpy": try: arr = reader.read_data_section_numpy_engine( - file_obj, (first_line, last_line) + file_obj, (first_line, last_line), null_policy ) except KeyboardInterrupt: raise diff --git a/lasio/reader.py b/lasio/reader.py index 89b02b3..6ae65c3 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -454,7 +454,7 @@ def items(f, start_line_no, end_line_no): return array -def read_data_section_numpy_engine(file_obj, line_nos): +def read_data_section_numpy_engine(file_obj, line_nos, null_policy): """Read data section into memory. Arguments: @@ -472,8 +472,31 @@ def read_data_section_numpy_engine(file_obj, line_nos): file_obj.seek(0) + if isinstance(null_policy, str): + # TODO: + # This is a temporary solution until numpy-reader and null_policies are + # fully aligned. + if null_policy in ("aggressive", "all"): + raise ValueError + missing = defaults.NULL_POLICIES[null_policy] + elif isinstance(null_policy, list): + # TODO: + # Align numpy-reader and null_policies + # missing = null_policy + raise ValueError + else: + missing = defaults.NULL_POLICIES["none"] + + # TODO: Finish/Correct the usemask/missing_values implementation. 
+ # We are currently passing a list to missing_values but it looks like + # missing_values only actually processes one item. array = np.genfromtxt( - file_obj, skip_header=first_line, max_rows=max_rows, names=None + file_obj, + skip_header=first_line, + max_rows=max_rows, + names=None, + usemask=True, + missing_values=missing, ) return array diff --git a/tests/test_null_policy.py b/tests/test_null_policy.py index b17e3b7..80355e2 100644 --- a/tests/test_null_policy.py +++ b/tests/test_null_policy.py @@ -77,7 +77,9 @@ def test_null_policy_custom_2(): assert las["SFLA"][2] == -999.25 +@pytest.mark.xfail(reason="TODO: need to fix for lasio's numpy-reader") def test_null_policy_ERR_strict(): + # Verify we can read-in text in a mostly numerical column las = read(egfn("null_policy_ERR.las"), null_policy="strict") assert las["RHOB"][2] == "ERR" diff --git a/tests/test_read.py b/tests/test_read.py index ed8d839..c62303b 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -276,6 +276,8 @@ def test_not_a_las_file(): las = lasio.read(egfn("not_a_las_file.las")) +# TODO: fix for numpy-reader +@pytest.mark.xfail def test_comma_decimal_mark_data(): las = lasio.read(egfn("comma_decimal_mark.las")) assert las["SFLU"][1] == 123.42 @@ -339,16 +341,19 @@ def test_emptyparam(capsys): assert not msg in out +@pytest.mark.xfail(reason="TODO: need to fix for lasio's numpy-reader") def test_data_characters_1(): las = lasio.read(egfn("data_characters.las")) assert las["TIME"][0] == "00:00:00" +@pytest.mark.xfail(reason="TODO: need to fix for lasio's numpy-reader") def test_data_characters_2(): las = lasio.read(egfn("data_characters.las")) assert las["DATE"][0] == "01-Jan-20" +@pytest.mark.xfail(reason="TODO: need to fix for lasio's numpy-reader") def test_data_characters_types(): from pandas.api.types import is_object_dtype from pandas.api.types import is_float_dtype From db551e89e5165b8a779a7eb6ded5c19488c94771 Mon Sep 17 00:00:00 2001 From: dcslagel Date: Sat, 24 Apr 2021 17:06:57 
-0600 Subject: [PATCH 12/18] change 'f' formating to oldstyle for python 3.5 --- lasio/las.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 82489a7..47deb07 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -143,12 +143,12 @@ def read( if isinstance(remove_data_line_filter, str): remove_startswith = remove_data_line_filter logger.debug( - f"Setting remove_startswith = '{remove_startswith}' for numpy engine" + "Setting remove_startswith = '{}' for numpy engine".format(remove_startswith) ) else: logger.debug( - f"Not setting remove_startswith for numpy engine " - f" (don't understand {remove_data_line_filter}" + "Not setting remove_startswith for numpy engine " + " (don't understand {}".format(remove_startswith) ) remove_startswith = [] From 0cb8d5b26f11e64b067b6a9513fc7339f4b976cc Mon Sep 17 00:00:00 2001 From: dcslagel Date: Sat, 1 May 2021 10:02:37 -0600 Subject: [PATCH 13/18] Merge-Squash numpy-genfromtext-explore on merge-base --- lasio/las.py | 116 ++++++++++++++++++++++++++++++++++---------- lasio/reader.py | 28 ++++++++++- lasio/writer.py | 18 ++++++- tests/test_speed.py | 1 + tests/test_write.py | 2 +- 5 files changed, 135 insertions(+), 30 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 3c61cbc..2bfac3f 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -10,6 +10,7 @@ import logging import re import sys +import traceback # get basestring in py3 @@ -65,6 +66,7 @@ def __init__(self, file_ref=None, **read_kwargs): super(LASFile, self).__init__() self._text = "" self.index_unit = None + self.index_initial = None default_items = defaults.get_default_items() self.sections = { "Version": default_items["Version"], @@ -80,15 +82,19 @@ def __init__(self, file_ref=None, **read_kwargs): def read( self, file_ref, - ignore_data=False, - read_policy="default", - null_policy="strict", ignore_header_errors=False, ignore_comments=("#",), mnemonic_case="upper", + ignore_data=False, + engine="numpy", + 
use_normal_engine_for_wrapped=True, + pandas_engine_error="retry", + pandas_engine_wrapped_error=True, + read_policy="default", + null_policy="strict", index_unit=None, remove_data_line_filter="#", - **kwargs + **kwargs, ): """Read a LAS file. @@ -97,10 +103,6 @@ def read( object, or a string containing the contents of a file. Keyword Arguments: - null_policy (str or list): see - http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically - ignore_data (bool): if True, do not read in any of the actual data, - just the header metadata. False by default. ignore_header_errors (bool): ignore LASHeaderErrors (False by default) ignore_comments (tuple/str): ignore comments beginning with characters @@ -108,13 +110,26 @@ def read( mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics 'upper': convert all HeaderItem mnemonics to uppercase 'lower': convert all HeaderItem mnemonics to lowercase - index_unit (str): Optionally force-set the index curve's unit to "m" or "ft" + ignore_data (bool): if True, do not read in any of the actual data, + just the header metadata. False by default. + engine (str): "normal": parse data section with normal Python reader + (quite slow); "numpy": parse data section with `numpy.genfromtxt` (fast). + By default the engine is "numpy". + use_normal_engine_for_wrapped (bool): if header metadata indicates that + the file is wrapped, always use the 'normal' engine. Default is True. + The only reason you should use False is if speed is a very high priority + and you had files with metadata that incorrectly indicates they are + wrapped. + read_policy (): TODO + null_policy (str or list): see + http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically remove_data_line_filter (str, func): string or function for removing/ignoring lines in the data section e.g. 
a function which accepts a string (a line from the data section) and returns either True (do not parse the line) or False (parse the line). If this argument is a string it will instead be converted to a function which rejects all lines starting with that value e.g. ``"#"`` will be converted to ``lambda line: line.strip().startswith("#")`` + index_unit (str): Optionally force-set the index curve's unit to "m" or "ft" See :func:`lasio.reader.open_with_codecs` for additional keyword arguments which help to manage issues relate to character encodings. @@ -123,6 +138,20 @@ def read( logger.debug("Reading {}...".format(str(file_ref))) + # Options specific to the numpy reader. + if engine == "numpy": + if isinstance(remove_data_line_filter, str): + remove_startswith = remove_data_line_filter + logger.debug( + f"Setting remove_startswith = '{remove_startswith}' for numpy engine" + ) + else: + logger.debug( + f"Not setting remove_startswith for numpy engine " + f" (don't understand {remove_data_line_filter}" + ) + remove_startswith = [] + file_obj = "" try: file_obj, self.encoding = reader.open_file(file_ref, **kwargs) @@ -241,6 +270,16 @@ def read( las3_data_section_indices.append(i) if not ignore_data: + + # Check whether file is wrapped and if so, attempt to use the + # normal engine. + if provisional_wrapped == "YES": + if engine != "normal": + logger.warning("Only engine='normal' can read wrapped files") + if use_normal_engine_for_wrapped: + engine = "normal" + + # Check for the number of columns in each data section. for k, first_line, last_line, section_title in [ section_positions[i] for i in data_section_indices ]: @@ -255,22 +294,45 @@ def read( ) file_obj.seek(k) + + # Read data section. 
# Notes see 2d9e43c3 and e960998f for 'try' background - try: - arr = reader.read_data_section_iterative( - file_obj, - (first_line, last_line), - regexp_subs, - value_null_subs, - remove_line_filter=remove_data_line_filter, - ) - except KeyboardInterrupt: - raise - except: - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) + + run_normal_engine = False + + # Attempt to read the data section + if engine == "numpy": + try: + arr = reader.read_data_section_iterative_numpy_engine( + file_obj, (first_line, last_line) + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + elif engine == "normal": + run_normal_engine = True + + if run_normal_engine: + try: + arr = reader.read_data_section_iterative_normal_engine( + file_obj, + (first_line, last_line), + regexp_subs, + value_null_subs, + remove_line_filter=remove_data_line_filter, + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + logger.debug("Read ndarray {arrshape}".format(arrshape=arr.shape)) # This is so we can check data size and use self.set_data(data, truncate=False) @@ -368,9 +430,11 @@ def read( logger.warning("Conflicting index units found: {}".format(matches)) self.index_unit = None + if len(self.curves) > 0: + self.index_initial = self.index.copy() + def update_start_stop_step(self, STRT=None, STOP=None, STEP=None, fmt="%.5f"): - """Configure or Change STRT, STOP, and STEP values - """ + """Configure or Change STRT, STOP, and STEP values""" if STRT is None: STRT = self.index[0] if STOP is None: diff --git a/lasio/reader.py b/lasio/reader.py index ce20f53..9309b90 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -393,7 +393,7 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, 
remove_line_filter="#" return item_counts[0] -def read_data_section_iterative( +def read_data_section_iterative_normal_engine( file_obj, line_nos, regexp_subs, value_null_subs, remove_line_filter ): """Read data section into memory. @@ -418,7 +418,7 @@ def read_data_section_iterative( A 1-D numpy ndarray. """ - + logger.debug("Parsing data section with normal reader") remove_line_filter = convert_remove_line_filter(remove_line_filter) title = file_obj.readline() @@ -454,6 +454,30 @@ def items(f, start_line_no, end_line_no): return array +def read_data_section_iterative_numpy_engine(file_obj, line_nos): + """Read data section into memory. + + Arguments: + file_obj: file-like object open for reading at the beginning of the section + line_nos (tuple): the first and last line no of the section to read + + + Returns: + A numpy ndarray. + """ + + first_line = line_nos[0] + 1 + last_line = line_nos[1] + max_rows = last_line - first_line + + file_obj.seek(0) + + array = np.genfromtxt( + file_obj, skip_header=first_line, max_rows=max_rows, names=None + ) + return array + + def get_substitutions(read_policy, null_policy): """Parse read and null policy definitions into a list of regexp and value substitutions. diff --git a/lasio/writer.py b/lasio/writer.py index a228fad..b7353c6 100644 --- a/lasio/writer.py +++ b/lasio/writer.py @@ -103,7 +103,23 @@ def write( "VERS", "", 2.0, "CWLS log ASCII Standard -VERSION 2.0" ) - las.update_start_stop_step() + # ------------------------------------------------------------------------- + # If an initial curve index was not read from a las file (las.index_initial) + # or the curve index has changed during processing + # or if the STOP value doesn't match the final index value + # then update the step variables before writing to a new las file object. 
+ # ------------------------------------------------------------------------- + index_changed = False + stop_is_different = False + + if las.index_initial is not None: + index_changed = not (las.index_initial == las.index).all() + stop_is_different = las.index_initial[-1] != las.well.STOP.value + else: + index_changed = True + + if index_changed or stop_is_different: + las.update_start_stop_step(STRT, STOP, STEP) # Write each section. # get_formatter_function ( ** get_section_widths ) diff --git a/tests/test_speed.py b/tests/test_speed.py index a02dd76..1319774 100644 --- a/tests/test_speed.py +++ b/tests/test_speed.py @@ -22,5 +22,6 @@ def read_file(): las = lasio.read(stegfn("1.2", "sample_big.las")) + def test_read_v12_sample_big(benchmark): benchmark(read_file) diff --git a/tests/test_write.py b/tests/test_write.py index c94792d..2e40fd4 100644 --- a/tests/test_write.py +++ b/tests/test_write.py @@ -501,7 +501,7 @@ def test_write_single_step(): ~Well ------------------------------------------------------ STRT.M 1670.0 : START DEPTH STOP.M 1670.0 : STOP DEPTH -STEP.M 0 : STEP +STEP.M 0.0 : STEP NULL. -999.25 : NULL VALUE COMP. ANY OIL COMPANY INC. : COMPANY WELL. AAAAA_2 : WELL From 39117b3bf2a6ccf653c23822c18fb87a71e1a334 Mon Sep 17 00:00:00 2001 From: dcslagel Date: Sat, 1 May 2021 13:00:51 -0600 Subject: [PATCH 14/18] Interm checkin add numpy-engine but hangs on a test This merge hangs on the following test. 
tests/test_enhancements.py::test_autodepthindex_point_one_inch --- lasio/las.py | 52 +++++++++++++++++++++++++++++++++---------------- lasio/reader.py | 2 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 80b4594..6bca13f 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -312,23 +312,41 @@ def read( dtypes = [dtypes.get(c.mnemonic, float) for c in self.curves] # Notes see 2d9e43c3 and e960998f for 'try' background - try: - curves_data_gen = reader.read_data_section_iterative( - file_obj, - (first_line, last_line), - regexp_subs, - value_null_subs, - ignore_comments=ignore_data_comments, - n_columns=reader_n_columns, - dtypes=dtypes, - ) - except KeyboardInterrupt: - raise - except: - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) + + run_normal_engine = False + + # Attempt to read the data section + if engine == "numpy": + try: + curves_data_gen = reader.read_data_section_iterative_numpy_engine( + file_obj, (first_line, last_line) + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) + elif engine == "normal": + run_normal_engine = True + + if run_normal_engine: + try: + curves_data_gen = reader.read_data_section_iterative_normal_engine( + file_obj, + (first_line, last_line), + regexp_subs, + value_null_subs, + remove_line_filter=remove_data_line_filter, + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) # Assign data to curves. 
curve_idx = 0 diff --git a/lasio/reader.py b/lasio/reader.py index dce0c01..4ec58a4 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -369,7 +369,7 @@ def inspect_data_section(file_obj, line_nos, regexp_subs, ignore_comments="#"): return item_counts[0] -def read_data_section_iterative( +def read_data_section_iterative_normal_engine( file_obj, line_nos, regexp_subs, value_null_subs, ignore_comments, n_columns, dtypes ): """Read data section into memory. From b8b5bcbe7e2c12198777519da07d2692ef8bd051 Mon Sep 17 00:00:00 2001 From: dcslagel Date: Sun, 2 May 2021 14:12:18 -0600 Subject: [PATCH 15/18] Fix merge issues - Remove unneeded 'run_normal_engine' - remove remove_data_line_filter - Move curve_data_gen Transform to numpy-eng - Transpose curve_data_gen to pass test_autodepthindex --- lasio/las.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 6bca13f..b1474dd 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -126,12 +126,6 @@ def read( read_policy (): TODO null_policy (str or list): see http://lasio.readthedocs.io/en/latest/data-section.html#handling-invalid-data-indicators-automatically - remove_data_line_filter (str, func): string or function for removing/ignoring lines - in the data section e.g. a function which accepts a string (a line from the - data section) and returns either True (do not parse the line) or False - (parse the line). If this argument is a string it will instead be converted - to a function which rejects all lines starting with that value e.g. ``"#"`` - will be converted to ``lambda line: line.strip().startswith("#")`` index_unit (str): Optionally force-set the index curve's unit to "m" or "ft" dtypes ("auto", dict or list): specify the data types for each curve in the ~ASCII data section. 
If "auto", each curve will be converted to floats if @@ -313,7 +307,6 @@ def read( # Notes see 2d9e43c3 and e960998f for 'try' background - run_normal_engine = False # Attempt to read the data section if engine == "numpy": @@ -321,6 +314,8 @@ def read( curves_data_gen = reader.read_data_section_iterative_numpy_engine( file_obj, (first_line, last_line) ) + # TODO: fix read_data_section_iterative_numpy_engine() so we don't need this. + curves_data_gen = curves_data_gen.T except KeyboardInterrupt: raise except: @@ -328,17 +323,17 @@ def read( traceback.format_exc()[:-1] + " in data section beginning line {}".format(i + 1) ) - elif engine == "normal": - run_normal_engine = True - if run_normal_engine: + if engine == "normal": try: curves_data_gen = reader.read_data_section_iterative_normal_engine( file_obj, (first_line, last_line), regexp_subs, value_null_subs, - remove_line_filter=remove_data_line_filter, + ignore_comments=ignore_data_comments, + n_columns=reader_n_columns, + dtypes=dtypes, ) except KeyboardInterrupt: raise @@ -348,6 +343,7 @@ def read( + " in data section beginning line {}".format(i + 1) ) + # Assign data to curves. 
curve_idx = 0 for curve_arr in curves_data_gen: From 058b1bfdcc270a04ff41c46e64a29fb80ba4b628 Mon Sep 17 00:00:00 2001 From: dcslagel Date: Wed, 26 May 2021 17:03:38 -0600 Subject: [PATCH 16/18] Handle single row data in np.genfromtxt --- lasio/las.py | 4 ---- lasio/reader.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index b1474dd..2ef5a6f 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -89,8 +89,6 @@ def read( ignore_data=False, engine="numpy", use_normal_engine_for_wrapped=True, - pandas_engine_error="retry", - pandas_engine_wrapped_error=True, read_policy="default", null_policy="strict", index_unit=None, @@ -314,8 +312,6 @@ def read( curves_data_gen = reader.read_data_section_iterative_numpy_engine( file_obj, (first_line, last_line) ) - # TODO: fix read_data_section_iterative_numpy_engine() so we don't need this. - curves_data_gen = curves_data_gen.T except KeyboardInterrupt: raise except: diff --git a/lasio/reader.py b/lasio/reader.py index 4ec58a4..6955c78 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -528,9 +528,18 @@ def read_data_section_iterative_numpy_engine(file_obj, line_nos): file_obj.seek(0) + # unpack=True tranforms the data from an array of rows to an array of columns. array = np.genfromtxt( - file_obj, skip_header=first_line, max_rows=max_rows, names=None + file_obj, skip_header=first_line, max_rows=max_rows, names=None, unpack=True ) + + # If there is only one data row, np.genfromtxt treats it as one array of + # individual values. Lasio needs a array of arrays. This if statement + # converts the single line data array to an array of arrays(column data). 
+ if len(array.shape) == 1: + arr_len = array.shape[0] + array = array.reshape(arr_len,1) + return array From b54f53808dc7de2ff49f45fb98db38ad2f2bd66c Mon Sep 17 00:00:00 2001 From: dcslagel Date: Wed, 2 Jun 2021 13:58:45 -0600 Subject: [PATCH 17/18] Return to exception fallback to old parser --- lasio/las.py | 22 ++++++++++++++++++---- lasio/reader.py | 1 + 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index 2ef5a6f..f41cdc3 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -315,10 +315,24 @@ def read( except KeyboardInterrupt: raise except: - raise exceptions.LASDataError( - traceback.format_exc()[:-1] - + " in data section beginning line {}".format(i + 1) - ) + try: + file_obj.seek(k) + curves_data_gen = reader.read_data_section_iterative_normal_engine( + file_obj, + (first_line, last_line), + regexp_subs, + value_null_subs, + ignore_comments=ignore_data_comments, + n_columns=reader_n_columns, + dtypes=dtypes, + ) + except KeyboardInterrupt: + raise + except: + raise exceptions.LASDataError( + traceback.format_exc()[:-1] + + " in data section beginning line {}".format(i + 1) + ) if engine == "normal": try: diff --git a/lasio/reader.py b/lasio/reader.py index 6955c78..118aa4e 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -533,6 +533,7 @@ def read_data_section_iterative_numpy_engine(file_obj, line_nos): file_obj, skip_header=first_line, max_rows=max_rows, names=None, unpack=True ) + # If there is only one data row, np.genfromtxt treats it as one array of # individual values. Lasio needs a array of arrays. This if statement # converts the single line data array to an array of arrays(column data). 
From 468a0c6ffd11c9eb8995d51ffffd8df1b8651b0e Mon Sep 17 00:00:00 2001 From: dcslagel Date: Fri, 11 Jun 2021 15:10:26 -0600 Subject: [PATCH 18/18] Add conditions for falling back to the normal parser --- lasio/las.py | 11 ++++++++--- lasio/reader.py | 6 ++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lasio/las.py b/lasio/las.py index f41cdc3..122b45d 100644 --- a/lasio/las.py +++ b/lasio/las.py @@ -162,7 +162,6 @@ def read( regexp_subs, value_null_subs, version_NULL = reader.get_substitutions( read_policy, null_policy ) - provisional_version = 2.0 provisional_wrapped = "YES" provisional_null = None @@ -269,9 +268,15 @@ def read( if not ignore_data: - # Check whether file is wrapped and if so, attempt to use the + # Override the default "numpy" parser with the 'normal' parser + # for these conditions: + # - file is wrapped + # - null_policy is not "strict" + # - dtypes is not "auto". Numpy can handle specified dtypes but + # the performance decays to the 'normal' performance level. + # normal engine. - if provisional_wrapped == "YES": + if provisional_wrapped == "YES" or null_policy != "strict" or dtypes != "auto": if engine != "normal": logger.warning("Only engine='normal' can read wrapped files") if use_normal_engine_for_wrapped: diff --git a/lasio/reader.py b/lasio/reader.py index 118aa4e..82f8172 100644 --- a/lasio/reader.py +++ b/lasio/reader.py @@ -528,9 +528,11 @@ def read_data_section_iterative_numpy_engine(file_obj, line_nos): file_obj.seek(0) - # unpack=True tranforms the data from an array of rows to an array of columns. + # unpack=True transforms the data from an array of rows to an array of columns. + # loose=False will throw an error on non-numerical data, which then sends the + # parsing to the 'normal' parser. array = np.genfromtxt( - file_obj, skip_header=first_line, max_rows=max_rows, names=None, unpack=True + file_obj, skip_header=first_line, max_rows=max_rows, names=None, unpack=True, loose=False )