From 40d6d4797f26f627c9cbc9516c5bdfbb485a549b Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Sun, 5 Mar 2023 20:03:14 +0100 Subject: [PATCH] Add wrapper for PyASCIIObject state bitfield accesses based on bindgen --- newsfragments/3015.added.md | 1 + pyo3-ffi/src/cpython/unicodeobject.rs | 273 +++++++++++++++++++++++--- src/ffi/tests.rs | 17 +- src/types/mod.rs | 2 +- src/types/string.rs | 20 +- 5 files changed, 268 insertions(+), 45 deletions(-) create mode 100644 newsfragments/3015.added.md diff --git a/newsfragments/3015.added.md b/newsfragments/3015.added.md new file mode 100644 index 00000000000..131aefd98ff --- /dev/null +++ b/newsfragments/3015.added.md @@ -0,0 +1 @@ +Support `PyASCIIObject` / `PyUnicode` and associated methods on big-endian architectures. \ No newline at end of file diff --git a/pyo3-ffi/src/cpython/unicodeobject.rs b/pyo3-ffi/src/cpython/unicodeobject.rs index 9ec21bf6989..27b55ef8309 100644 --- a/pyo3-ffi/src/cpython/unicodeobject.rs +++ b/pyo3-ffi/src/cpython/unicodeobject.rs @@ -30,6 +30,184 @@ use std::os::raw::{c_char, c_int, c_uint, c_void}; // skipped Py_UNICODE_HIGH_SURROGATE // skipped Py_UNICODE_LOW_SURROGATE +// generated by bindgen v0.63.0 (with small adaptations) +#[repr(C)] +struct BitfieldUnit { + storage: Storage, +} + +impl BitfieldUnit { + #[inline] + pub const fn new(storage: Storage) -> Self { + Self { storage } + } +} + +impl BitfieldUnit +where + Storage: AsRef<[u8]> + AsMut<[u8]>, +{ + #[inline] + fn get_bit(&self, index: usize) -> bool { + debug_assert!(index / 8 < self.storage.as_ref().len()); + let byte_index = index / 8; + let byte = self.storage.as_ref()[byte_index]; + let bit_index = if cfg!(target_endian = "big") { + 7 - (index % 8) + } else { + index % 8 + }; + let mask = 1 << bit_index; + byte & mask == mask + } + + #[inline] + fn set_bit(&mut self, index: usize, val: bool) { + debug_assert!(index / 8 < self.storage.as_ref().len()); + let byte_index = index / 8; + let byte = &mut self.storage.as_mut()[byte_index]; + let bit_index = if cfg!(target_endian = "big") { + 7 - (index % 8) + } else { + index % 8 + }; + let mask = 1 << bit_index; + if val { + *byte |= mask; + } else { + *byte &= !mask; + } + } + + #[inline] + fn get(&self, bit_offset: usize, bit_width: u8) -> u64 { + debug_assert!(bit_width <= 64); + debug_assert!(bit_offset / 8 < self.storage.as_ref().len()); + debug_assert!((bit_offset + (bit_width as usize)) / 8 <= self.storage.as_ref().len()); + let mut val = 0; + for i in 0..(bit_width as usize) { + if self.get_bit(i + bit_offset) { + let index = if cfg!(target_endian = "big") { + bit_width as usize - 1 - i + } else { + i + }; + val |= 1 << index; + } + } + val + } + + #[inline] + fn set(&mut self, bit_offset: usize, bit_width: u8, val: u64) { + debug_assert!(bit_width <= 64); + debug_assert!(bit_offset / 8 < self.storage.as_ref().len()); + debug_assert!((bit_offset + (bit_width as usize)) / 8 <= self.storage.as_ref().len()); + for i in 0..(bit_width as usize) { + let mask = 1 << i; + let val_bit_is_set = val & mask == mask; + let index = if cfg!(target_endian = "big") { + bit_width as usize - 1 - i + } else { + i + }; + self.set_bit(index + bit_offset, val_bit_is_set); + } + } +} + +// generated by bindgen v0.63.0 (with small adaptations) +// The same code is generated for Python 3.7, 3.8, 3.9, 3.10, and 3.11, but the "ready" field +// has been removed from Python 3.12. + +/// Wrapper around the `PyASCIIObject.state` bitfield with getters and setters that work +/// on most little- and big-endian architectures. +/// +/// Memory layout of C bitfields is implementation defined, so these functions are still +/// unsafe. Users must verify that they work as expected on the architectures they target. +#[repr(C)] +#[repr(align(4))] +struct PyASCIIObjectState { + _bitfield_align: [u8; 0], + _bitfield: BitfieldUnit<[u8; 4usize]>, +} + +// c_uint and u32 are not necessarily the same type on all targets / architectures +#[allow(clippy::useless_transmute)] +impl PyASCIIObjectState { + #[inline] + unsafe fn interned(&self) -> c_uint { + std::mem::transmute(self._bitfield.get(0usize, 2u8) as u32) + } + + #[inline] + unsafe fn set_interned(&mut self, val: c_uint) { + let val: u32 = std::mem::transmute(val); + self._bitfield.set(0usize, 2u8, val as u64) + } + + #[inline] + unsafe fn kind(&self) -> c_uint { + std::mem::transmute(self._bitfield.get(2usize, 3u8) as u32) + } + + #[inline] + unsafe fn set_kind(&mut self, val: c_uint) { + let val: u32 = std::mem::transmute(val); + self._bitfield.set(2usize, 3u8, val as u64) + } + + #[inline] + unsafe fn compact(&self) -> c_uint { + std::mem::transmute(self._bitfield.get(5usize, 1u8) as u32) + } + + #[inline] + unsafe fn set_compact(&mut self, val: c_uint) { + let val: u32 = std::mem::transmute(val); + self._bitfield.set(5usize, 1u8, val as u64) + } + + #[inline] + unsafe fn ascii(&self) -> c_uint { + std::mem::transmute(self._bitfield.get(6usize, 1u8) as u32) + } + + #[inline] + unsafe fn set_ascii(&mut self, val: c_uint) { + let val: u32 = std::mem::transmute(val); + self._bitfield.set(6usize, 1u8, val as u64) + } + + #[inline] + unsafe fn ready(&self) -> c_uint { + std::mem::transmute(self._bitfield.get(7usize, 1u8) as u32) + } + + #[inline] + unsafe fn set_ready(&mut self, val: c_uint) { + let val: u32 = std::mem::transmute(val); + self._bitfield.set(7usize, 1u8, val as u64) + } +} + +impl From for PyASCIIObjectState { + #[inline] + fn from(value: u32) -> Self { + PyASCIIObjectState { + _bitfield_align: [], + _bitfield: BitfieldUnit::new(value.to_ne_bytes()), + } + } +} + +impl From for u32 { + #[inline] + fn from(value: PyASCIIObjectState) -> Self { + u32::from_ne_bytes(value._bitfield.storage) + } +} + #[repr(C)] pub struct PyASCIIObject { pub ob_base: PyObject, @@ -52,34 +230,98 @@ pub struct PyASCIIObject { } /// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe. -/// -/// In addition, they are disabled on big-endian architectures to restrict this to most "common" -/// platforms, which are at least tested on CI and appear to be sound. -#[cfg(target_endian = "little")] impl PyASCIIObject { + /// Get the `interned` field of the [`PyASCIIObject`] state bitfield. + /// + /// Returns one of: [`SSTATE_NOT_INTERNED`], [`SSTATE_INTERNED_MORTAL`], [`SSTATE_INTERNED_IMMORTAL`] #[inline] pub unsafe fn interned(&self) -> c_uint { - self.state & 3 + PyASCIIObjectState::from(self.state).interned() } + /// Set the `interned` field of the [`PyASCIIObject`] state bitfield. + /// + /// Calling this function with an argument that is not [`SSTATE_NOT_INTERNED`], + /// [`SSTATE_INTERNED_MORTAL`], or [`SSTATE_INTERNED_IMMORTAL`] is invalid. + #[inline] + pub unsafe fn set_interned(&mut self, val: c_uint) { + let mut state = PyASCIIObjectState::from(self.state); + state.set_interned(val); + self.state = u32::from(state); + } + + /// Get the `kind` field of the [`PyASCIIObject`] state bitfield. + /// + /// Returns one of: [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`], [`PyUnicode_2BYTE_KIND`], + /// [`PyUnicode_4BYTE_KIND`] #[inline] pub unsafe fn kind(&self) -> c_uint { - (self.state >> 2) & 7 + PyASCIIObjectState::from(self.state).kind() } + /// Set the `kind` field of the [`PyASCIIObject`] state bitfield. + /// + /// Calling this function with an argument that is not [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`], + /// [`PyUnicode_2BYTE_KIND`], or [`PyUnicode_4BYTE_KIND`] is invalid. + #[inline] + pub unsafe fn set_kind(&mut self, val: c_uint) { + let mut state = PyASCIIObjectState::from(self.state); + state.set_kind(val); + self.state = u32::from(state); + } + + /// Get the `compact` field of the [`PyASCIIObject`] state bitfield. + /// + /// Returns either `0` or `1`. #[inline] pub unsafe fn compact(&self) -> c_uint { - (self.state >> 5) & 1 + PyASCIIObjectState::from(self.state).compact() + } + + /// Set the `compact` flag of the [`PyASCIIObject`] state bitfield. + /// + /// Calling this function with an argument that is neither `0` nor `1` is invalid. + #[inline] + pub unsafe fn set_compact(&mut self, val: c_uint) { + let mut state = PyASCIIObjectState::from(self.state); + state.set_compact(val); + self.state = u32::from(state); } + /// Get the `ascii` field of the [`PyASCIIObject`] state bitfield. + /// + /// Returns either `0` or `1`. #[inline] pub unsafe fn ascii(&self) -> c_uint { - (self.state >> 6) & 1 + PyASCIIObjectState::from(self.state).ascii() } + /// Set the `ascii` flag of the [`PyASCIIObject`] state bitfield. + /// + /// Calling this function with an argument that is neither `0` nor `1` is invalid. + #[inline] + pub unsafe fn set_ascii(&mut self, val: c_uint) { + let mut state = PyASCIIObjectState::from(self.state); + state.set_ascii(val); + self.state = u32::from(state); + } + + /// Get the `ready` field of the [`PyASCIIObject`] state bitfield. + /// + /// Returns either `0` or `1`. #[inline] pub unsafe fn ready(&self) -> c_uint { - (self.state >> 7) & 1 + PyASCIIObjectState::from(self.state).ready() + } + + /// Set the `ready` flag of the [`PyASCIIObject`] state bitfield. + /// + /// Calling this function with an argument that is neither `0` nor `1` is invalid. + #[inline] + pub unsafe fn set_ready(&mut self, val: c_uint) { + let mut state = PyASCIIObjectState::from(self.state); + state.set_ready(val); + self.state = u32::from(state); } } @@ -120,7 +362,6 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1; pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2; #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { debug_assert!(crate::PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -129,13 +370,11 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).compact() } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint { ((*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0).into() } @@ -149,25 +388,21 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2; pub const PyUnicode_4BYTE_KIND: c_uint = 4; #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 { PyUnicode_DATA(op) as *mut Py_UCS1 } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 { PyUnicode_DATA(op) as *mut Py_UCS2 } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 { PyUnicode_DATA(op) as *mut Py_UCS4 } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { debug_assert!(crate::PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -176,7 +411,6 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { if PyUnicode_IS_ASCII(op) != 0 { (op as *mut PyASCIIObject).offset(1) as *mut c_void @@ -186,7 +420,6 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null()); @@ -194,7 +427,6 @@ pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(crate::PyUnicode_Check(op) != 0); @@ -210,7 +442,6 @@ pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void { // skipped PyUnicode_READ_CHAR #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t { debug_assert!(crate::PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -219,7 +450,6 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t { } #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).ready() } @@ -227,7 +457,6 @@ pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { #[cfg(not(Py_3_12))] #[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))] #[inline] -#[cfg(target_endian = "little")] pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int { debug_assert!(crate::PyUnicode_Check(op) != 0); diff --git a/src/ffi/tests.rs b/src/ffi/tests.rs index a0d32504497..97e838cf527 100644 --- a/src/ffi/tests.rs +++ b/src/ffi/tests.rs @@ -1,9 +1,7 @@ use crate::ffi::*; use crate::{types::PyDict, AsPyPointer, IntoPy, Py, PyAny, Python}; -#[cfg(target_endian = "little")] use crate::types::PyString; -#[cfg(target_endian = "little")] use libc::wchar_t; #[cfg_attr(target_arch = "wasm32", ignore)] // DateTime import fails on wasm for mysterious reasons @@ -108,7 +106,6 @@ fn test_timezone_from_offset_and_name() { }) } -#[cfg(target_endian = "little")] #[test] fn ascii_object_bitfield() { let ob_base: PyObject = unsafe { std::mem::zeroed() }; @@ -118,7 +115,7 @@ fn ascii_object_bitfield() { length: 0, #[cfg(not(PyPy))] hash: 0, - state: 0, + state: 0u32, wstr: std::ptr::null_mut() as *mut wchar_t, }; @@ -130,27 +127,26 @@ fn ascii_object_bitfield() { assert_eq!(o.ready(), 0); for i in 0..4 { - o.state = i; + o.set_interned(i); assert_eq!(o.interned(), i); } for i in 0..8 { - o.state = i << 2; + o.set_kind(i); assert_eq!(o.kind(), i); } - o.state = 1 << 5; + o.set_compact(1); assert_eq!(o.compact(), 1); - o.state = 1 << 6; + o.set_ascii(1); assert_eq!(o.ascii(), 1); - o.state = 1 << 7; + o.set_ready(1); assert_eq!(o.ready(), 1); } } -#[cfg(target_endian = "little")] #[test] #[cfg_attr(Py_3_10, allow(deprecated))] fn ascii() { @@ -191,7 +187,6 @@ fn ascii() { }) } -#[cfg(target_endian = "little")] #[test] #[cfg_attr(Py_3_10, allow(deprecated))] fn ucs4() { diff --git a/src/types/mod.rs b/src/types/mod.rs index 923dfcc096d..8d3e2714383 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -36,7 +36,7 @@ pub use self::pysuper::PySuper; pub use self::sequence::PySequence; pub use self::set::PySet; pub use self::slice::{PySlice, PySliceIndices}; -#[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] +#[cfg(not(Py_LIMITED_API))] pub use self::string::PyStringData; pub use self::string::{PyString, PyString as PyUnicode}; pub use self::traceback::PyTraceback; diff --git a/src/types/string.rs b/src/types/string.rs index 2dc7a506476..3998a6adf04 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -1,6 +1,6 @@ // Copyright (c) 2017-present PyO3 Project and Contributors -#[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] +#[cfg(not(Py_LIMITED_API))] use crate::exceptions::PyUnicodeDecodeError; use crate::types::PyBytes; use crate::{ffi, AsPyPointer, PyAny, PyResult, Python}; @@ -12,7 +12,7 @@ use std::str; /// /// Python internally stores strings in various representations. This enumeration /// represents those variations. -#[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] +#[cfg(not(Py_LIMITED_API))] #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum PyStringData<'a> { /// UCS1 representation. @@ -25,7 +25,7 @@ pub enum PyStringData<'a> { Ucs4(&'a [u32]), } -#[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] +#[cfg(not(Py_LIMITED_API))] impl<'a> PyStringData<'a> { /// Obtain the raw bytes backing this instance as a [u8] slice. pub fn as_bytes(&self) -> &[u8] { @@ -238,9 +238,7 @@ impl PyString { /// /// By using this API, you accept responsibility for testing that PyStringData behaves as /// expected on the targets where you plan to distribute your software. - /// - /// For example, it is known not to work on big-endian platforms. - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] pub unsafe fn data(&self) -> PyResult> { let ptr = self.as_ptr(); @@ -284,7 +282,7 @@ mod tests { use super::*; use crate::Python; use crate::{PyObject, ToPyObject}; - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] use std::borrow::Cow; #[test] @@ -347,7 +345,7 @@ mod tests { } #[test] - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] fn test_string_data_ucs1() { Python::with_gil(|py| { let s = PyString::new(py, "hello, world"); @@ -360,7 +358,7 @@ mod tests { } #[test] - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] fn test_string_data_ucs1_invalid() { Python::with_gil(|py| { // 0xfe is not allowed in UTF-8. @@ -386,7 +384,7 @@ mod tests { } #[test] - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] fn test_string_data_ucs2() { Python::with_gil(|py| { let s = py.eval("'foo\\ud800'", None, None).unwrap(); @@ -428,7 +426,7 @@ mod tests { } #[test] - #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))] + #[cfg(not(Py_LIMITED_API))] fn test_string_data_ucs4() { Python::with_gil(|py| { let s = "哈哈🐈";