Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrap GMT_Put_Strings to pass str columns into GMT C API directly #520

Merged
merged 36 commits into from
Aug 10, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c811e65
Move test_put_* from test_clib to test_clib_put
weiji14 Jul 13, 2020
5c5f152
Initial wrapper for GMT_Put_Strings C API function
weiji14 Jul 13, 2020
d7c3053
Merge branch 'master' into gmt_put_strings
weiji14 Jul 14, 2020
fccedae
Try using GMT_TEXT for numpy string types
weiji14 Jul 14, 2020
07b95ed
Remove column argument from put_strings function
weiji14 Jul 14, 2020
88eef3a
Set valid GMT data mode as GMT_IS_OUTPUT
weiji14 Jun 22, 2020
6390e04
Try passing "GMT_IS_VECTOR" family type to put_strings
weiji14 Jul 14, 2020
42af00e
Do `put_vector` x and y before `put_strings` s, dim is 2 only
weiji14 Jul 14, 2020
9ef04b0
Refactor text to use virtualfile_from_vectors instead of pandas tempfile
weiji14 Jul 14, 2020
ab1e041
Split test_clib_put into separate _strings, _matrix and _vector files
weiji14 Jul 14, 2020
01754fc
Try using ctypes.POINTER in argtypes of c_put_strings
weiji14 Jul 14, 2020
452d3d3
Merge branch 'master' into gmt_put_strings
seisman Jul 22, 2020
9c8dad9
Merge branch 'master' into gmt_put_strings
seisman Jul 23, 2020
6dff9ba
Merge branch 'master' into gmt_put_strings
seisman Jul 27, 2020
cfb9124
Merge branch 'master' into gmt_put_strings
seisman Jul 27, 2020
2485437
Pass strings using "GMT_IS_VECTOR|GMT_IS_DUPLICATE"
seisman Jul 27, 2020
7fa1f0c
Merge branch 'master' into gmt_put_strings
seisman Jul 29, 2020
e8e7768
Use GMT_IS_VECTOR|GMT_IS_DUPLICATE when calling put_strings
seisman Jul 30, 2020
2f0c798
Improve the put_strings test
seisman Jul 31, 2020
6d6ea24
Merge branch 'master' into gmt_put_strings
seisman Aug 4, 2020
9d063b3
Add back import pandas as pd
weiji14 Aug 4, 2020
2d81f76
Merge branch 'master' into gmt_put_strings
seisman Aug 6, 2020
95afa90
Merge branch 'master' into gmt_put_strings
weiji14 Aug 9, 2020
ed0dce6
Revert refactor text to use virtualfile_from_vectors
weiji14 Aug 9, 2020
aeabd8e
Add test for passing in one string column to virtualfile_from_vectors
weiji14 Aug 9, 2020
abcbcf7
Refactor virtualfile_from_vectors to handle up to 2 string columns
weiji14 Aug 10, 2020
129135e
Fix test_plot_datetime by not passing first two columns into put_strings
weiji14 Aug 10, 2020
4f1ccda
Refactor to handle any number of string type columns
weiji14 Aug 10, 2020
30c18eb
Test for put_strings failing to increase code coverage
weiji14 Aug 10, 2020
62469cb
Expect failures for tests using GMT_Put_strings on GMT < 6.1.1
weiji14 Aug 10, 2020
26f51c9
Fix an incorrect note in virtualfile_from_vectors
weiji14 Aug 10, 2020
726de44
Concatenate last string columns instead of allowing arbitrary positions
weiji14 Aug 10, 2020
4c99477
Test variable length strings
weiji14 Aug 10, 2020
2f9d689
Replace gmt info with select in test_virtualfile_from_vectors_str_cols
weiji14 Aug 10, 2020
fe794c9
Fix truncated strings and an incorrect test
weiji14 Aug 10, 2020
f59d970
Replace gmt select with convert in test_virttualfile_from_vectors_str…
weiji14 Aug 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 10 additions & 25 deletions pygmt/base_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
Does not define any special non-GMT methods (savefig, show, etc).
"""
import contextlib
import csv
import numpy as np
import pandas as pd

from .clib import Session
from .exceptions import GMTInvalidInput
Expand All @@ -14,7 +12,6 @@
dummy_context,
data_kind,
fmt_docstring,
GMTTempFile,
use_alias,
kwargs_to_strings,
)
Expand Down Expand Up @@ -970,27 +967,15 @@ def text(
if position is not None and isinstance(position, str):
kwargs["F"] += f'+c{position}+t"{text}"'

with GMTTempFile(suffix=".txt") as tmpfile:
with Session() as lib:
fname = textfiles if kind == "file" else ""
if kind == "vectors":
if position is not None:
fname = ""
else:
pd.DataFrame.from_dict(
{
"x": np.atleast_1d(x),
"y": np.atleast_1d(y),
"text": np.atleast_1d(text),
}
).to_csv(
tmpfile.name,
sep="\t",
header=False,
index=False,
quoting=csv.QUOTE_NONE,
)
fname = tmpfile.name

with Session() as lib:
file_context = dummy_context(textfiles) if kind == "file" else ""
if kind == "vectors":
if position is not None:
file_context = dummy_context("")
else:
file_context = lib.virtualfile_from_vectors(
np.atleast_1d(x), np.atleast_1d(y), np.atleast_1d(text)
)
with file_context as fname:
arg_str = " ".join([fname, build_arg_string(kwargs)])
lib.call_module("text", arg_str)
79 changes: 68 additions & 11 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"GMT_IS_SURFACE",
]

MODES = ["GMT_CONTAINER_ONLY", "GMT_OUTPUT"]
MODES = ["GMT_CONTAINER_ONLY", "GMT_IS_OUTPUT"]

REGISTRATIONS = ["GMT_GRID_PIXEL_REG", "GMT_GRID_NODE_REG"]

Expand Down Expand Up @@ -235,7 +235,7 @@ def __getitem__(self, name):
value = c_get_enum(session, name.encode())

if value is None or value == -99999:
raise GMTCLibError("Constant '{}' doesn't exits in libgmt.".format(name))
raise GMTCLibError(f"Constant '{name}' doesn't exist in libgmt.")

return value

Expand Down Expand Up @@ -511,13 +511,13 @@ def create_data(self, family, geometry, mode, **kwargs):
----------
family : str
A valid GMT data family name (e.g., ``'GMT_IS_DATASET'``). See the
``data_families`` attribute for valid names.
``FAMILIES`` attribute for valid names.
geometry : str
A valid GMT data geometry name (e.g., ``'GMT_IS_POINT'``). See the
``data_geometries`` attribute for valid names.
``GEOMETRIES`` attribute for valid names.
mode : str
A valid GMT data mode (e.g., ``'GMT_OUTPUT'``). See the
``data_modes`` attribute for valid names.
A valid GMT data mode (e.g., ``'GMT_IS_OUTPUT'``). See the
``MODES`` attribute for valid names.
dim : list of 4 integers
The dimensions of the dataset. See the documentation for the GMT C
API function ``GMT_Create_Data`` (``src/gmt_api.c``) for the full
Expand Down Expand Up @@ -731,7 +731,7 @@ def put_vector(self, dataset, column, vector):
"""
Attach a numpy 1D array as a column on a GMT dataset.

Use this functions to attach numpy array data to a GMT dataset and pass
Use this function to attach numpy array data to a GMT dataset and pass
it to GMT modules. Wraps ``GMT_Put_Vector``.

The dataset must be created by :meth:`~gmt.clib.Session.create_data`
Expand Down Expand Up @@ -791,11 +791,61 @@ def put_vector(self, dataset, column, vector):
)
)

def put_strings(self, dataset, family, strings):
"""
Attach a numpy 1D array of dtype str as a column on a GMT dataset.

Use this function to attach string type numpy array data to a GMT
dataset and pass it to GMT modules. Wraps ``GMT_Put_Strings``.

The dataset must be created by :meth:`~gmt.clib.Session.create_data`
first.

.. warning::
The numpy array must be C contiguous in memory. If it comes from a
column slice of a 2d array, for example, you will have to make a
copy. Use :func:`numpy.ascontiguousarray` to make sure your vector
is contiguous (it won't copy if it already is).

Parameters
----------
dataset : :class:`ctypes.c_void_p`
The ctypes void pointer to a ``GMT_Dataset``. Create it with
:meth:`~gmt.clib.Session.create_data`.
family : str
The family type of the dataset. Can be either ``GMT_IS_VECTOR`` or
``GMT_IS_MATRIX``.
strings : numpy 1d-array
The array that will be attached to the dataset. Must be a 1d C
contiguous array.

Raises
------
GMTCLibError
If given invalid input or ``GMT_Put_Strings`` exits with status !=
0.

"""
c_put_strings = self.get_libgmt_func(
"GMT_Put_Strings",
argtypes=[ctp.c_void_p, ctp.c_uint, ctp.c_void_p, ctp.c_void_p],
restype=ctp.c_int,
)

strings_pointer = strings.ctypes.data_as(ctp.c_char_p)
status = c_put_strings(
self.session_pointer, self[family], dataset, strings_pointer
)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following Python code correctly passes an array of strings to the myfunc function in the test.so library:

# ...
arr = (ctypes.c_char_p * len(string))()
arr[:] = [s.encode() for s in string]
lib.myfunc(arr)

The Python trick comes from https://stackoverflow.com/questions/3494598/. It works but I don't understand the details, and don't know if it's the simplest way.

Yes I tried this following your datetime example, but I think (?) it's functionally equivalent to strings.ctypes.data_as(ctp.c_char_p)? Either way it runs, but the string data doesn't seem to get written to the virtualfile so the test fails. I've also tried strings.ctypes.data_as(ctp.POINTER(ctp.c_char_p)) but it doesn't seem to make a difference. 😕

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

strings.ctypes.data_as(ctp.c_char_p) may not work.

What I did was add a print statement inside the GMT_Put_Strings function to print the string arrays passed to it. Using strings.ctypes.data_as(ctp.c_char_p) crashes immediately for me, whereas my code prints the strings correctly but then crashes 🤦‍♂️.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about using:

c_put_strings = self.get_libgmt_func(
    "GMT_Put_Strings",
    argtypes=[
        ctp.c_void_p,
        ctp.c_uint,
        ctp.c_void_p,
        ctp.POINTER(ctp.c_char_p),
    ],
    restype=ctp.c_int,
)
strings_pointer = (ctp.c_char_p * len(strings))()
strings_pointer[:] = np.char.encode(strings)

Does it crash for you?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it still crashes, but I can see that the strings are correctly passed to the GMT_Put_Strings function.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about strings.ctypes.data_as(ctp.POINTER(ctp.c_char_p))? Are the strings correctly passed in?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I was just taking a look at it (thanks for tracking down the bug by the way!). Sounds like we'll need to pin to GMT > 6.1.1 for the next release?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I'm not 100% sure GenericMappingTools/gmt#3718 is related to the issue here. I just had the feeling that the string array may be freed by Python when GMT tries to read it.

Hopefully, GMT 6.1.1 can fix this issue and #515.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, GenericMappingTools/gmt#3718 is already merged into GMT master, and will be backported to 6.1 branch soon.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great! I'll test it out when I get the time (need to prepare some stuff for an online conference next week). Still need to wait on the grid problem at #515 but that's a separate issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just triggered the GMT master branch testing. Ideally, we should only see one failure from test_put_strings.

if status != 0:
raise GMTCLibError(
f"Failed to put strings of type {strings.dtype} into dataset"
)

def put_matrix(self, dataset, matrix, pad=0):
"""
Attach a numpy 2D array to a GMT dataset.

Use this functions to attach numpy array data to a GMT dataset and pass
Use this function to attach numpy array data to a GMT dataset and pass
it to GMT modules. Wraps ``GMT_Put_Matrix``.

The dataset must be created by :meth:`~gmt.clib.Session.create_data`
Expand Down Expand Up @@ -856,12 +906,12 @@ def write_data(self, family, geometry, mode, wesn, output, data):
----------
family : str
A valid GMT data family name (e.g., ``'GMT_IS_DATASET'``). See the
``data_families`` attribute for valid names. Don't use the
``FAMILIES`` attribute for valid names. Don't use the
``GMT_VIA_VECTOR`` or ``GMT_VIA_MATRIX`` constructs for this. Use
``GMT_IS_VECTOR`` and ``GMT_IS_MATRIX`` instead.
geometry : str
A valid GMT data geometry name (e.g., ``'GMT_IS_POINT'``). See the
``data_geometries`` attribute for valid names.
``GEOMETRIES`` attribute for valid names.
mode : str
How the data is to be written to the file. This option varies
depending on the given family. See the GMT API documentation for
Expand Down Expand Up @@ -1085,6 +1135,9 @@ def virtualfile_from_vectors(self, *vectors):
arrays = vectors_to_arrays(vectors)

columns = len(arrays)
if np.issubdtype(arrays[-1].dtype, np.str_):
columns -= 1

rows = len(arrays[0])
if not all(len(i) == rows for i in arrays):
raise GMTInvalidInput("All arrays must have same size.")
Expand All @@ -1096,8 +1149,12 @@ def virtualfile_from_vectors(self, *vectors):
family, geometry, mode="GMT_CONTAINER_ONLY", dim=[columns, rows, 1, 0]
)

for col, array in enumerate(arrays):
# Use put_vector for first n columns with numerical type data
for col, array in enumerate(arrays[:columns]):
self.put_vector(dataset, column=col, vector=array)
# Use put_strings for last column with string type data
for array in arrays[columns:]:
self.put_strings(dataset, family="GMT_IS_VECTOR", strings=array)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is incorrect. We can't call GMT_Put_Strings multiple times.

See the comments in #483 (comment), we have to combine all trailing string arrays into one array and call GMT_Put_Strings once.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does just call put strings once, or zero times. If you examine the code before, the slice either returns a list with one item or a list with zero items.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean these lines?

        columns = len(arrays)
        if np.issubdtype(arrays[-1].dtype, np.str_):
            columns -= 1

It won't work for input with multiple trailing string arrays. For example, for the text module, the input array may be

x = [1, 2]
y = [3, 4]
font = ["15p,1,blue", "15p,1,red"]
text = ["labelA", "labelB"]

We have to combine font and text arrays into one single string array before passing it via GMT_Put_Strings.

newarray = ["15p,1,blue labelA", "15p,1,red labelB"]

Copy link
Member Author

@weiji14 weiji14 Jul 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I see, but do we want to do the concatenation:

  1. Before passing it into lib.virtualfile_from_vectors, i.e. at the fig.text (and also fig.meca, etc) level
  2. In the clib itself, i.e. find all the arrays that have a 'str' data type, and concatenate them inside lib.virtualfile_from_vectors before passing it into put_strings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Option 2 is easier and more intuitive for module wrapper developers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not so easy for the clib developer though 😆. I do agree with going with Option 2, but I'm not sure how the memory pointers would work if we concatenate two numpy arrays with 'str' dtypes. It might be that we'll be duplicating memory somehow.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I know about Python and GMT, I think we may have to duplicate the memory in PyGMT, and then duplicate it again in GMT.


with self.open_virtual_file(
family, geometry, "GMT_IN|GMT_IS_REFERENCE", dataset
Expand Down
153 changes: 0 additions & 153 deletions pygmt/tests/test_clib.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,159 +304,6 @@ def test_create_data_fails():
)


def test_put_vector():
    """
    Check that assigning a numpy array to a dataset works.

    For every dtype in the list below, three 1d arrays are attached to a
    vector container with ``put_vector``, the container is written to a
    temporary file with ``write_data``, and the file is read back and
    compared with the arrays that were attached.
    """
    for dtype in ("float32", "float64", "int32", "int64", "uint32", "uint64"):
        with clib.Session() as lib:
            dataset = lib.create_data(
                family="GMT_IS_DATASET|GMT_VIA_VECTOR",
                geometry="GMT_IS_POINT",
                mode="GMT_CONTAINER_ONLY",
                dim=[3, 5, 1, 0],  # columns, rows, layers, dtype
            )
            # Pair each GMT column constant with the array stored under it
            expected = (
                ("GMT_X", np.array([1, 2, 3, 4, 5], dtype=dtype)),
                ("GMT_Y", np.array([6, 7, 8, 9, 10], dtype=dtype)),
                ("GMT_Z", np.array([11, 12, 13, 14, 15], dtype=dtype)),
            )
            for constant, values in expected:
                lib.put_vector(dataset, column=lib[constant], vector=values)
            # Round-trip through a file to see if the data is accessed
            # correctly
            with GMTTempFile() as tmp_file:
                lib.write_data(
                    "GMT_IS_VECTOR",
                    "GMT_IS_POINT",
                    "GMT_WRITE_SET",
                    [0] * 6,  # wesn doesn't matter for Datasets
                    tmp_file.name,
                    dataset,
                )
                # Exactly three columns are expected back from the file
                got_x, got_y, got_z = tmp_file.loadtxt(unpack=True, dtype=dtype)
                npt.assert_allclose(got_x, expected[0][1])
                npt.assert_allclose(got_y, expected[1][1])
                npt.assert_allclose(got_z, expected[2][1])


def test_put_vector_invalid_dtype():
    """
    Check that it fails with an exception for invalid data types.

    An array of dtype ``object`` is passed to ``put_vector`` and a
    ``GMTInvalidInput`` exception is expected.
    """
    container_spec = dict(
        family="GMT_IS_DATASET|GMT_VIA_VECTOR",
        geometry="GMT_IS_POINT",
        mode="GMT_CONTAINER_ONLY",
        dim=[2, 3, 1, 0],  # columns, rows, layers, dtype
    )
    with clib.Session() as lib:
        dataset = lib.create_data(**container_spec)
        bad_vector = np.array([37, 12, 556], dtype="object")
        with pytest.raises(GMTInvalidInput):
            lib.put_vector(dataset, column=1, vector=bad_vector)


def test_put_vector_wrong_column():
    """
    Check that it fails with an exception when giving an invalid column.

    The container is created with a single column and the vector is then
    attached at ``column=1``; a ``GMTCLibError`` is expected.
    """
    container_spec = dict(
        family="GMT_IS_DATASET|GMT_VIA_VECTOR",
        geometry="GMT_IS_POINT",
        mode="GMT_CONTAINER_ONLY",
        dim=[1, 3, 1, 0],  # columns, rows, layers, dtype
    )
    with clib.Session() as lib:
        dataset = lib.create_data(**container_spec)
        vector = np.array([37, 12, 556], dtype="float32")
        with pytest.raises(GMTCLibError):
            lib.put_vector(dataset, column=1, vector=vector)


def test_put_vector_2d_fails():
    """
    Check that it fails with an exception for multidimensional arrays.

    A 2x3 array is handed to ``put_vector`` and a ``GMTInvalidInput``
    exception is expected.
    """
    container_spec = dict(
        family="GMT_IS_DATASET|GMT_VIA_VECTOR",
        geometry="GMT_IS_POINT",
        mode="GMT_CONTAINER_ONLY",
        dim=[1, 6, 1, 0],  # columns, rows, layers, dtype
    )
    with clib.Session() as lib:
        dataset = lib.create_data(**container_spec)
        two_dim = np.array([[37, 12, 556], [37, 12, 556]], dtype="int32")
        with pytest.raises(GMTInvalidInput):
            lib.put_vector(dataset, column=0, vector=two_dim)


def test_put_matrix():
    """
    Check that assigning a numpy 2d array to a dataset works.

    For every dtype in the list below, a 3x4 matrix is attached with
    ``put_matrix``, written to a temporary file with ``write_data``, and
    read back for comparison with the original matrix.
    """
    nrows, ncols = 3, 4
    for dtype in ("float32", "float64", "int32", "int64", "uint32", "uint64"):
        with clib.Session() as lib:
            dataset = lib.create_data(
                family="GMT_IS_DATASET|GMT_VIA_MATRIX",
                geometry="GMT_IS_POINT",
                mode="GMT_CONTAINER_ONLY",
                dim=[ncols, nrows, 1, 0],  # columns, rows, layers, dtype
            )
            matrix = np.arange(nrows * ncols, dtype=dtype).reshape((nrows, ncols))
            lib.put_matrix(dataset, matrix=matrix)
            # Round-trip through a file to see if the data is accessed
            # correctly
            with GMTTempFile() as tmp_file:
                lib.write_data(
                    "GMT_IS_MATRIX",
                    "GMT_IS_POINT",
                    "GMT_WRITE_SET",
                    [0] * 6,  # wesn doesn't matter for Datasets
                    tmp_file.name,
                    dataset,
                )
                roundtrip = tmp_file.loadtxt(dtype=dtype)
                npt.assert_allclose(roundtrip, matrix)


def test_put_matrix_fails():
    """
    Check that put_matrix raises an exception if return code is not zero.

    It's hard to make put_matrix fail on the C API level because of all the
    checks on input arguments, so the C API function is mocked to return 1
    just to make sure the error path works.
    """
    with clib.Session() as lib, mock(lib, "GMT_Put_Matrix", returns=1):
        dummy_matrix = np.empty((10, 2))
        with pytest.raises(GMTCLibError):
            lib.put_matrix(dataset=None, matrix=dummy_matrix, pad=0)


def test_put_matrix_grid():
    """
    Check that assigning a numpy 2d array to a grid works.

    For every dtype in the list below, a matrix sized from the region and
    increments is attached to a grid container with ``put_matrix``, written
    to a temporary file with ``write_data``, and read back for comparison.
    """
    wesn = [10, 15, 30, 40, 0, 0]
    inc = [1, 1]
    west, east, south, north = wesn[:4]
    # Node counts along y (rows) and x (columns), both ends inclusive
    nrows = (north - south) // inc[1] + 1
    ncols = (east - west) // inc[0] + 1
    for dtype in ("float32", "float64", "int32", "int64", "uint32", "uint64"):
        with clib.Session() as lib:
            grid = lib.create_data(
                family="GMT_IS_GRID|GMT_VIA_MATRIX",
                geometry="GMT_IS_SURFACE",
                mode="GMT_CONTAINER_ONLY",
                ranges=wesn[:4],
                inc=inc,
                registration="GMT_GRID_NODE_REG",
            )
            matrix = np.arange(nrows * ncols, dtype=dtype).reshape((nrows, ncols))
            lib.put_matrix(grid, matrix=matrix)
            # Round-trip through a file to see if the data is accessed
            # correctly
            with GMTTempFile() as tmp_file:
                lib.write_data(
                    "GMT_IS_MATRIX",
                    "GMT_IS_SURFACE",
                    "GMT_CONTAINER_AND_DATA",
                    wesn,
                    tmp_file.name,
                    grid,
                )
                roundtrip = tmp_file.loadtxt(dtype=dtype)
                npt.assert_allclose(roundtrip, matrix)


def test_virtual_file():
"Test passing in data via a virtual file with a Dataset"
dtypes = "float32 float64 int32 int64 uint32 uint64".split()
Expand Down
Loading